In [1]:
import pandas as pd
import numpy as np
from script import DataManager

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

dm = DataManager()

#### We now have access to several DataFrames - after running the cell above:

- `dm.train_df`  
    all *train-*.csv* files concatenated into 1 **DataFrame**
- `dm.writing_df`  
    the *writing.json* file parsed into a **DataFrame**
- `dm.directing_df`  
    the *directing.json* file parsed into a **DataFrame**
- `dm.joined_df`   
    result of `writing_df` left-joined with `directing_df` on column *movie*)
- `df.validation_df`  
    the *validation_hidden.csv* file as a **DataFrame**
- `dm.test_df`  
    the *test_hidden.csv* file as a **DataFrame**

In [2]:
import requests
url = "http://www.omdbapi.com/?i=tt0010600&apikey=<>"
res = requests.get(url)
print(res.content)

b'{"Response":"False","Error":"Invalid API key!"}'


In [9]:
dm.train_df.head()

Unnamed: 0_level_0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,special_chars
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,tt0009369,Mickey,Mickey,1918,\N,93,1119.0,False,
4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True,
5,tt0011439,The Mark of Zorro,The Mark of Zorro,1920,\N,79,2439.0,True,
6,tt0011607,The Parson's Widow,Prästänkan,1920,\N,94,1264.0,True,
7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True,


# Imputation

In [10]:
dm.train_df.isna().sum()

tconst            0
primaryTitle      0
originalTitle     0
startYear         0
endYear           0
runtimeMinutes    0
numVotes          0
label             0
special_chars     0
dtype: int64

In [11]:
dm.train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7959 entries, 2 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          7959 non-null   object
 1   primaryTitle    7959 non-null   object
 2   originalTitle   7959 non-null   object
 3   startYear       7959 non-null   object
 4   endYear         7959 non-null   object
 5   runtimeMinutes  7959 non-null   object
 6   numVotes        7959 non-null   object
 7   label           7959 non-null   bool  
 8   special_chars   7959 non-null   object
dtypes: bool(1), object(8)
memory usage: 567.4+ KB


# Imputation

In [17]:
dm.train_df["endYear"].replace({"\\N": 0}, inplace=True)
dm.train_df["startYear"].replace({"\\N": 0}, inplace=True)
dm.train_df["runtimeMinutes"].replace({"\\N": 0}, inplace=True)
dm.train_df["numVotes"].replace({"NULL": np.nan}, inplace=True)

dm.train_df["startYear"] = pd.to_numeric(dm.train_df["startYear"])
dm.train_df["endYear"] = pd.to_numeric(dm.train_df["endYear"])
dm.train_df["label"] = dm.train_df["label"].astype(int)
dm.train_df["runtimeMinutes"] = pd.to_numeric(dm.train_df["runtimeMinutes"])

In [18]:
df = dm.train_df.drop(['tconst', 'primaryTitle', 'originalTitle', 'special_chars'], axis=1)

In [25]:
df.head()

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label
0,0.949035,0.0,0.168784,4.7e-05,0.0
1,0.94953,0.0,0.119782,0.000358,1.0
2,0.950025,0.0,0.143376,0.000575,1.0
3,0.950025,0.0,0.170599,0.000105,1.0
4,0.950025,0.0,0.263158,0.001748,1.0


In [20]:
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
df.head(10)

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label
0,0.949035,0.0,0.168784,4.7e-05,0.0
1,0.94953,0.0,0.119782,0.000358,1.0
2,0.950025,0.0,0.143376,0.000575,1.0
3,0.950025,0.0,0.170599,0.000105,1.0
4,0.950025,0.0,0.263158,0.001748,1.0
5,0.95052,0.0,0.123412,0.04813,1.0
6,0.95052,0.0,0.176044,0.001934,1.0
7,0.95052,0.0,0.272232,,1.0
8,0.951014,0.0,0.165154,0.005066,1.0
9,0.951014,0.0,0.259528,0.00047,1.0


In [21]:
df.isna().any()

startYear         False
endYear           False
runtimeMinutes    False
numVotes           True
label             False
dtype: bool

In [22]:
imputer = KNNImputer(n_neighbors=5)
df = pd.DataFrame(imputer.fit_transform(df),columns = df.columns)

In [23]:
df.isna().any()

startYear         False
endYear           False
runtimeMinutes    False
numVotes          False
label             False
dtype: bool

In [24]:
df.head(10)

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label
0,0.949035,0.0,0.168784,4.7e-05,0.0
1,0.94953,0.0,0.119782,0.000358,1.0
2,0.950025,0.0,0.143376,0.000575,1.0
3,0.950025,0.0,0.170599,0.000105,1.0
4,0.950025,0.0,0.263158,0.001748,1.0
5,0.95052,0.0,0.123412,0.04813,1.0
6,0.95052,0.0,0.176044,0.001934,1.0
7,0.95052,0.0,0.272232,0.001573,1.0
8,0.951014,0.0,0.165154,0.005066,1.0
9,0.951014,0.0,0.259528,0.00047,1.0
