In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#load the dataset
data = pd.read_csv('troop_movements10m.csv')

#display first few rows
print("first few rows of the dataframe:")
print(data.head())

first few rows of the dataframe:
             timestamp  unit_id     unit_type  location_x  location_y  \
0  2023-06-13 17:33:18        1         at-st         2.0         8.0   
1  2023-06-13 17:33:17        2  tie_silencer         4.0         4.0   
2  2023-06-13 17:33:16        3         at-at         0.0         3.0   
3  2023-06-13 17:33:15        4  tie_silencer         6.0         1.0   
4  2023-06-13 17:33:14        5   tie_fighter         0.0         4.0   

   destination_x  destination_y    homeworld  
0              1              1  Glee Anselm  
1              0              1    Trandosha  
2              6              1     Corellia  
3              6              9        Shili  
4              9              6   Muunilinst  


In [13]:
data['unit_type'].unique()

array(['at-st', 'tie_silencer', 'at-at', 'tie_fighter', 'stormtrooper',
       'x-wing', 'resistance_soldier', 'invalid_unit'], dtype=object)

In [14]:
data['unit_type'].replace('invalid_unit', 'unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['unit_type'].replace('invalid_unit', 'unknown', inplace=True)


In [15]:
data['location_x'].unique()

array([ 2.,  4.,  0.,  6.,  7.,  8.,  3.,  9.,  5.,  1., nan])

In [16]:
data['location_x'].ffill(inplace=True)
data['location_y'].ffill(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['location_x'].ffill(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['location_y'].ffill(inplace=True)


In [17]:
data.to_parquet('troop_movements10m.parquet', engine='pyarrow')

In [18]:
df = pd.read_parquet('troop_movements10m.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      object 
 1   unit_id        int64  
 2   unit_type      object 
 3   location_x     float64
 4   location_y     float64
 5   destination_x  int64  
 6   destination_y  int64  
 7   homeworld      object 
dtypes: float64(2), int64(3), object(3)
memory usage: 610.4+ MB


In [19]:
import pickle
with open('starwars_model.pkl', 'rb') as f:
    predictions = pickle.load(f)

In [23]:
# Get feature importances 
columns_to_encode = ['unit_type', 'homeworld']

df_encoded = pd.get_dummies(df[columns_to_encode], prefix=columns_to_encode)

df = pd.concat([df, df_encoded.astype(bool)], axis=1)

df = df.drop(columns=columns_to_encode)

MemoryError: Unable to allocate 477. MiB for an array with shape (50, 10000000) and data type bool

In [11]:
df.columns

Index(['timestamp', 'unit_id', 'location_x', 'location_y', 'destination_x',
       'destination_y', 'unit_type_Unknown', 'unit_type_at-at',
       'unit_type_at-st', 'unit_type_resistance_soldier',
       'unit_type_stormtrooper', 'unit_type_tie_fighter',
       'unit_type_tie_silencer', 'unit_type_x-wing', 'homeworld_Alderaan',
       'homeworld_Aleen Minor', 'homeworld_Bestine IV', 'homeworld_Cerea',
       'homeworld_Champala', 'homeworld_Chandrila', 'homeworld_Concord Dawn',
       'homeworld_Corellia', 'homeworld_Dagobah', 'homeworld_Dathomir',
       'homeworld_Dorin', 'homeworld_Eriadu', 'homeworld_Glee Anselm',
       'homeworld_Haruun Kal', 'homeworld_Iktotch', 'homeworld_Iridonia',
       'homeworld_Kalee', 'homeworld_Kashyyyk', 'homeworld_Malastare',
       'homeworld_Mirial', 'homeworld_Mon Cala', 'homeworld_Muunilinst',
       'homeworld_Naboo', 'homeworld_Ojom', 'homeworld_Quermia',
       'homeworld_Rodia', 'homeworld_Ryloth', 'homeworld_Serenno',
       'homeworld_Shili

In [10]:
X = pd.concat([data.iloc[:, 8:]], axis=1)
df['predictions'] = predictions.predict(df[8:])
y = df['predictions']

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- destination_x
- destination_y
- location_x
- location_y
- timestamp
- ...
Feature names seen at fit time, yet now missing:
- unit_type_unknown


In [50]:
df.head()

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld,predictions
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm,<bound method BaseDecisionTree.predict of Deci...
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha,<bound method BaseDecisionTree.predict of Deci...
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia,<bound method BaseDecisionTree.predict of Deci...
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili,<bound method BaseDecisionTree.predict of Deci...
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst,<bound method BaseDecisionTree.predict of Deci...


In [43]:
X = pd.concat([df.iloc[:, 8:]], axis=1)
y = df['is_resistance']

KeyError: 'is_resistance'

In [None]:
X = pd.concat([data.iloc[:, 8:]], axis=1)
y = data['is_resistance']
# split data 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Create Decision Tree Classifier
model = DecisionTreeClassifier()

# fit the model
model.fit(X_train, y_train)

# make predictions on test set
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [40]:
import pickle
loaded_model = pickle.load(open('starwars_model.pkl', 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

NameError: name 'X_test' is not defined