#### IMPORTS

In [69]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

#### LOADING DATA

In [70]:
# Read train.csv, which serves as the main dataset for this notebook
data = pd.read_csv("../data/train.csv")

# Normalise the column names in one pass:
# spaces and hyphens become underscores, then everything is lowercased
data.columns = (
    data.columns
    .str.replace(' ', '_')
    .str.replace('-', '_')
    .str.lower()
)

# Separate predictors (X) from the 'lead' target (y) with a single boolean
# column mask; selecting by mask keeps y as a one-column DataFrame
is_target = data.columns == 'lead'
X = data.loc[:, ~is_target]
y = data.loc[:, is_target]

#### SPLITTING DATA

In [71]:
# Hold out a test partition; no test_size is given, so the library default
# applies, and the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Cell output: shape of each partition, in the order train/test X then train/test y
[part.shape for part in (X_train, X_test, y_train, y_test)]

[(779, 13), (260, 13), (779, 1), (260, 1)]

#### BUILD PIPELINE

In this section we build a pipeline that fits a KNN model to our training dataset and reports the training and testing accuracies. Within the pipeline we use **StandardScaler** to scale the data and then **KNeighborsClassifier** for fitting and prediction. The output will be in the following format.

Train Accuracy: XXXX
Test Accuracy: XXXX

In [72]:
# Two-step pipeline: standardise the features, then classify with
# k-nearest neighbours using 12 neighbours
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=12))

# Full list of candidate feature columns, kept for reference only:
#   number_words_female, total_words, number_of_words_lead,
#   difference_in_words_lead_and_co_lead, number_of_male_actors, year,
#   number_of_female_actors, number_words_male, gross, mean_age_male,
#   mean_age_female, age_lead, age_co_lead
#
# The model uses the subset below, i.e. everything above except
# 'year', 'gross' and 'mean_age_male'.
selected_features = [
    'number_words_female',
    'total_words',
    'number_of_words_lead',
    'difference_in_words_lead_and_co_lead',
    'number_of_male_actors',
    'number_of_female_actors',
    'number_words_male',
    'mean_age_female',
    'age_lead',
    'age_co_lead',
]

# Pick the model inputs once and flatten the single-column target frames
# into 1-D label arrays so they can be reused for fitting and scoring
train_features = X_train[selected_features]
test_features = X_test[selected_features]
train_labels = y_train.to_numpy().reshape(-1, )
test_labels = y_test.to_numpy().reshape(-1, )

pipe.fit(train_features, train_labels)

# Report accuracy on the data the model was fitted on and on the held-out split
train_accuracy = pipe.score(train_features, train_labels)
test_accuracy = pipe.score(test_features, test_labels)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")


Train Accuracy: 0.8523748395378691
Test Accuracy: 0.8076923076923077
