In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [3]:
df = pd.read_csv("project_data/train_dataset.csv").dropna(subset=["language"]).reset_index()
test = pd.read_csv("project_data/test_dataset.csv")
labels = df.iloc[:,-1]
# df = df.iloc[:, :-1]


In [65]:
zeroR = [2] * test.shape[0]
print(f"The ZeroR train accuracy is {1839/3004}")

The ZeroR train accuracy is 0.6121837549933422


In [6]:
def OneR(df, col, labels):
    levels = {}

    if df.dtypes[col]=='int64':
        bins = np.histogram(df[col], bins=4)[1]
        values = pd.DataFrame({'var' :   pd.Series(np.digitize(df[col], bins)), 
                               'label' : labels})  

    else:
        if len(df[col].unique())<100:
            values = pd.DataFrame({'var' :  df[col], 'label' : labels})
        else:
            return [] # too computationally heavy if too many categories
    
    for x in values['var'].unique():
        class_ = values[values['var'] == x].label.value_counts().nlargest(1, keep='all')

        if len(class_)==0: 
            print(class_)
        
        if len(class_)>1: levels[x] = class_.sample(n=1)
        levels[x] = class_.index[0]

    pred = []
    for i in range(values.shape[0]):
        pred.append(levels[values['var'][i]])
    return(pred)

In [24]:
def OneR_fit(train, test, col):
    levels = {}
    
    # values is df with the binned values

    if train.dtypes[col]=='int64':
        bins = np.histogram(train[col], bins=4)[1]
        values = pd.DataFrame({'var' :   pd.Series(np.digitize(train[col], bins)), 
                               'label' : train['imdb_score_binned']})  

    else:
        if len(train[col].unique())<100:
            values = pd.DataFrame({'var' :  train[col], 'label' : train['imdb_score_binned']})
        else:
            return [] # too computationally heavy if too many categories
    
    for x in values['var'].unique():
        class_ = values[values['var'] == x].label.value_counts().nlargest(1, keep='all')
        
        if len(class_)>1: levels[x] = class_.sample(n=1)
        levels[x] = class_.index[0]

    # bin the test values
    test_values = pd.Series(np.digitize(test[col], bins))

    # Make predictions
    pred = []
    for i in range(len(test_values)):
        pred.append(levels[test_values[i]])
    return(pred)  

In [7]:
pred_list = {}
for i in range(1,df.shape[1]-1):
    col = df.columns[i]
    predictions = OneR(df, col, labels)
    pred_list[col] = predictions

In [8]:
scores = {}
for col in pred_list.keys():
    if len(pred_list[col])>0:
        accuracy = pd.DataFrame({'pred':pred_list[col], 'actual':labels})
        accuracy['correct'] = accuracy.actual==accuracy.pred
        score = sum(accuracy.correct)/len(accuracy.correct)
        scores[col] = score
top_score = max(scores, key=scores.get)

In [9]:
print(f"The OneR is {top_score} with train accuracy : {scores[top_score]}")

The OneR is num_voted_users with accuracy : 0.6363636363636364


OneR output

In [25]:
test_labs = OneR_fit(df, test, top_score)
kaggle = pd.DataFrame({'id':test['id'], 'imdb_score_binned': test_labs})
kaggle.to_csv("results_OneR.csv", index=False)

ZeroR output

In [32]:
kaggle0 = pd.DataFrame({'id':test['id'], 'imdb_score_binned': [2] * test.shape[0]})
kaggle0.to_csv("results_ZeroR.csv", index=False)

# Useful Code (Preprocessing)

### Genres - Fixed

In [None]:
genres = df['genres'].str.split("|")
genres = list(set([a for b in genres.tolist() for a in b])) # get all the unique genres

for x in genres:
    df[x] = df.genres.str.count(x) # this is gonna give 0 or 1 for each row and genre

df.drop(["genres"], axis=1, inplace=True)    


In [41]:
df.describe(include=[object])

Unnamed: 0,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,language,country,content_rating,title_embedding
count,3003,3003,3003,3003,3003,3003,3003,3003,3003,3003,3003
unique,1459,1902,675,1264,2941,2197,2942,32,42,12,2941
top,Steven Spielberg,Morgan Freeman,Comedy|Drama|Romance,Johnny Depp,King Kong,Steve Coogan,animal name in title|ape abducts a woman|goril...,English,USA,R,[-2.8369315e-03 1.3950688e-03 -3.8431014e-04 ...
freq,21,15,122,30,3,7,3,2872,2381,1362,3
