1 - Load Data

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

trainFeatures = pd.read_csv(open("train_features.tsv", encoding="utf8"), sep = '\t')
trainLabels = pd.read_csv(open("train_labels.tsv", encoding="utf8"), sep = '\t')
testFeatures = pd.read_csv(open("test_features.tsv", encoding="utf8"), sep = '\t')

trainFeat = deepcopy(trainFeatures)
trainLab = deepcopy(trainLabels)
testFeat = deepcopy(testFeatures)

trainSet = trainFeat.merge(trainLab, on='movieId', how = 'left')

2 - Clean Data

In [2]:
trainSet1 = trainSet.dropna(how = 'any')
trainSet1.reset_index(drop = True, inplace = True)
trainSet1['year'].replace(to_replace = '2010) ', value = '2010', inplace = True)
trainSet1['year'].replace(to_replace = '2005) ', value = '2005', inplace = True)
trainSet1['year'].replace(to_replace = '2013) ', value = '2013', inplace = True)
trainSet1['year'].replace(to_replace = '2011) ', value = '2011', inplace = True)
trainSet1['year'].replace(to_replace = '2012) ', value = '2012', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


3 - Preprocessing

3.1 - Vectorize words using Tfidf

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

#train features - tags

newTagTrain = []

for i in range(0, 5237):
    tag = re.sub('[^a-zA-Z]', ' ', trainSet1['tag'][i])
    tag = tag.lower()
    tag = tag.split()
    ps = PorterStemmer()
    tag = [ps.stem(word) for word in tag if not word in set(stopwords.words('english'))]
    tag = ' '.join(tag)
    newTagTrain.append(tag)
    
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 1500)
tagTrainMatrix = cv.fit_transform(newTagTrain).toarray()
tagTrainDf = pd.DataFrame(tagTrainMatrix)
tagTrainDf.columns = cv.get_feature_names()

tagTrainDf = tagTrainDf.rename(columns = lambda x: x + "_ta")

#train features - titles

newTitleTrain = []

for i in range(0, 5237):
    title = re.sub('[^a-zA-Z]', ' ', trainSet1['title'][i])
    title = title.lower()
    title = title.split()
    ps = PorterStemmer()
    title = [ps.stem(word) for word in title if not word in set(stopwords.words('english'))]
    title = ' '.join(title)
    newTitleTrain.append(title)
    
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 1500)
titleTrainMatrix = cv.fit_transform(newTitleTrain).toarray()
titleTrainDf = pd.DataFrame(titleTrainMatrix)
titleTrainDf.columns = cv.get_feature_names()

titleTrainDf = titleTrainDf.rename(columns = lambda x: x + "_ti")

#test features - tags

newTagTest = []

for i in range(0, 235):
    tag = re.sub('[^a-zA-Z]', ' ', testFeat['tag'][i])
    tag = tag.lower()
    tag = tag.split()
    ps = PorterStemmer()
    tag = [ps.stem(word) for word in tag if not word in set(stopwords.words('english'))]
    tag = ' '.join(tag)
    newTagTest.append(tag)
    
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 1500)
tagTestMatrix = cv.fit_transform(newTagTest).toarray()
tagTestDf = pd.DataFrame(tagTestMatrix)
tagTestDf.columns = cv.get_feature_names()

tagTestDf = tagTestDf.rename(columns = lambda x: x + "_ta")

#test features - titles

newTitleTest = []

for i in range(0, 235):
    title = re.sub('[^a-zA-Z]', ' ', testFeat['title'][i])
    title = title.lower()
    title = title.split()
    ps = PorterStemmer()
    title = [ps.stem(word) for word in title if not word in set(stopwords.words('english'))]
    title = ' '.join(title)
    newTitleTest.append(title)
    
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 1500)
titleTestMatrix = cv.fit_transform(newTitleTest).toarray()
titleTestDf = pd.DataFrame(titleTestMatrix)
titleTestDf.columns = cv.get_feature_names()

titleTestDf = titleTestDf.rename(columns = lambda x: x + "_ti")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3.2 One hot encoder for years

In [5]:
yearTr = trainSet1['year']
yearAuxTr = pd.DataFrame(yearTr)
yearsTrain = pd.get_dummies(yearAuxTr, drop_first = True)

yearTe = testFeat['year']
yearAuxTe = pd.DataFrame(yearTe)
yearAuxTe = yearAuxTe.astype(str)
yearsTest = pd.get_dummies(yearAuxTe, drop_first = True)

3.3 - Scale data and match columns

In [6]:
# drop columns movieId - YTId - title - tag - year
trainSet2 = deepcopy(trainSet1)
trainSet2.drop(columns = ['movieId', 'title', 'YTId', 'tag', 'year'], axis = 1, inplace = True)

testFeat1 = deepcopy(testFeat)
testFeat1.drop(columns = ['movieId', 'title', 'YTId', 'tag', 'year'], axis = 1, inplace = True)

In [8]:
# check dimensions
y_train = trainSet2.iloc[:, 127].values
XAuxTrain = trainSet2.iloc[:, 0:127]

# check dimensions
XAuxTest = testFeat1.iloc[:, 0:127]

# Feature Scaling - we use tfidf raw features
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

XAuxTrain_yearsTrain = pd.concat([XAuxTrain, yearsTrain], axis = 1)
XAuxTest_yearsTest = pd.concat([XAuxTest, yearsTest], axis = 1)
XAuxTrain_yearsTrain1, XAuxTest_yearsTest1 = XAuxTrain_yearsTrain.align(XAuxTest_yearsTest, join = 'outer', axis = 1, fill_value = 0)

XAuxTrain_yearsTrain_Sc = pd.DataFrame(ss.fit_transform(XAuxTrain_yearsTrain1),columns = XAuxTrain_yearsTrain1.columns)
XAuxTest_yearsTest_Sc = pd.DataFrame(ss.transform(XAuxTest_yearsTest1),columns = XAuxTest_yearsTest1.columns)

In [9]:
X_train = pd.concat([XAuxTrain_yearsTrain_Sc, tagTrainDf, titleTrainDf], axis = 1)

In [10]:
X_test = pd.concat([XAuxTest_yearsTest_Sc, tagTestDf, titleTestDf], axis = 1)

In [11]:
X_train1, X_test1 = X_train.align(X_test, join = 'outer', axis = 1, fill_value = 0)

In [12]:
# Save the values
X_train2 = deepcopy(X_train1)
X_test2 = deepcopy(X_test1)

4 - Feature selection (All features)

In [13]:
#anova test
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func = f_classif, k = 2000)
fit = bestfeatures.fit(X_train2, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train2.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
print(featureScores.nlargest(20, 'Score'))  #print 20 best features

               Specs       Score
2019          war_ta  118.662076
1361         noir_ta   80.839123
526   documentari_ta   73.705935
649            fi_ta   60.914879
1705          sci_ta   60.914879
67           anim_ta   54.865338
916            ii_ta   48.883634
2080        world_ta   48.883634
43          alien_ta   42.873215
1308        music_ta   42.627156
190             avf7   38.646457
629       fantasi_ta   38.443335
1502        polit_ta   33.574243
150            avf33   33.176945
179             avf6   31.645255
199            avf78   30.259596
172            avf53   30.218690
146             avf3   29.632619
522        disney_ta   27.233223
205            avf83   27.196010


  265  273  277  300  308  314  318  319  320  324  342  343  348  350
  353  357  358  364  371  373  376  377  378  385  391  395  411  413
  416  417  430  432  434  447  462  469  483  490  493  506  514  517
  518  524  530  540  541  563  565  566  567  569  570  572  577  583
  588  596  598  603  605  607  622  633  642  646  661  669  690  692
  709  719  731  732  749  767  772  773  784  786  788  802  810  811
  825  827  831  846  851  853  859  864  881  900  904  914  919  925
  933  940  945  955  957  958  990  994 1001 1002 1004 1021 1036 1038
 1040 1050 1051 1057 1058 1075 1077 1096 1102 1128 1133 1136 1144 1173
 1179 1182 1190 1198 1205 1225 1236 1242 1268 1270 1278 1287 1295 1306
 1316 1318 1320 1335 1336 1339 1348 1367 1378 1396 1406 1418 1428 1483
 1484 1515 1518 1529 1556 1562 1570 1587 1588 1597 1613 1625 1629 1653
 1654 1656 1670 1672 1677 1685 1687 1707 1722 1725 1739 1751 1755 1758
 1759 1779 1788 1804 1810 1814 1818 1821 1826 1828 1837 1847 1851 1857
 1864 

In [15]:
bestFeatures = featureScores.nlargest(200,'Score')
X_train3 = X_train2[bestFeatures['Specs']]
X_test3 = X_test2[bestFeatures['Specs']]

5 - Logistic Regression Upload

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

c = LogisticRegression(multi_class = 'multinomial', max_iter = 100000, solver = 'newton-cg') # class_weight = 'balanced', solver = 'newton-cg'
c.fit(X_train3, y_train)
y_pred_LR = c.predict(X_test3)

movieId = testFeat['movieId'].to_numpy()
kaggle = pd.DataFrame(data = [movieId, y_pred_LR]).T
kaggle.to_csv('kaggle992038_LR.csv', header = ['movieId', 'genres'], index = False)

6 - XGBoost Upload

In [18]:
from xgboost import XGBClassifier
from sklearn import metrics

c = XGBClassifier(learning_rate = 0.01, n_estimators = 1000, max_depth = 3, subsample = 0.8, colsample_bytree = 1, gamma = 1)
c.fit(X_train3, y_train)
y_pred_XGB = c.predict(X_test3)

movieId = testFeat['movieId'].to_numpy()
kaggle = pd.DataFrame(data = [movieId, y_pred_XGB]).T
kaggle.to_csv('kaggle992038_XGB.csv', header = ['movieId', 'genres'], index = False)