In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import string
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv('each_genre200.csv')

In [4]:
df.head()

Unnamed: 0,imdb_title_id,img_path,title,first_genre,description
0,tt0068334,/content/drive/MyDrive/itmo/3sem/ML for Indust...,Il candidato,Comedy,Bill McKay is a candidate for the U.S. Senate ...
1,tt1305714,/content/drive/MyDrive/itmo/3sem/ML for Indust...,Make the Yuletide Gay,Comedy,"A gay student who is ""out"" at college but not ..."
2,tt0081689,/content/drive/MyDrive/itmo/3sem/ML for Indust...,"Uno contro l'altro, praticamente amici",Comedy,An industrialist wants to corrupt a politician...
3,tt0057004,/content/drive/MyDrive/itmo/3sem/ML for Indust...,Dottore nei guai,Comedy,Dr. Simon Sparrow's (Sir Dirk Bogarde's) love ...
4,tt0042665,/content/drive/MyDrive/itmo/3sem/ML for Indust...,Last Holiday,Comedy,"When a lonely, unappreciated farm equipment sa..."


In [5]:
X = df[['title','description']]
y = df['first_genre']

In [6]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) | set(stopwords.words('french')) | set(stopwords.words('italian'))

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...


True

In [7]:
import re

def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    text = ' '.join(text.split())
    return text

In [8]:
titles = X['title']
descriptions = X['description']

In [9]:
X['cleaned_titles'] = X['title'].apply(lambda x: clean_text(x))
X['cleaned_descriptions'] = X['description'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned_titles'] = X['title'].apply(lambda x: clean_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned_descriptions'] = X['description'].apply(lambda x: clean_text(x))


In [10]:
X['tokenized_titles'] = X['cleaned_titles'].apply(lambda x: x.split())
X['tokenized_descriptions'] = X['cleaned_descriptions'].apply(lambda x: x.split())

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [12]:
X['target_encoded'] = y_encoded

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X['cleaned_descriptions'], y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [14]:
# create TF-IDF features
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='multinomial',max_iter=1000).fit(X_train_tfidf, y_train)

In [16]:
y_pred = lr.predict(X_test_tfidf)
y_pred_probs = lr.predict_proba(X_test_tfidf)

In [17]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, f1_score
roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr", average="weighted")
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred,average="weighted")
f1 = f1_score(y_test,y_pred,average="weighted")

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(roc_auc)
print(acc)
print(prec)
print(f1)

0.8190857936494454
0.32505910165484636
0.32353067678425407
0.3037537516707412


In [20]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train_tfidf, y_train)

In [29]:
y_pred = dt.predict(X_test_tfidf)
y_pred_probs = dt.predict_proba(X_test_tfidf)

In [30]:
roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr", average="weighted")
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred,average="weighted")
f1 = f1_score(y_test,y_pred,average="weighted")
print(roc_auc)
print(acc)
print(prec)
print(f1)

0.5686278285395313
0.17612293144208038
0.18354734510285056
0.17652615275984765


In [42]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()#.fit(X_train_tfidf, y_train)
rf_params = {"n_estimators":[50,100,300,500],
             "max_depth":[3,5,7,15,20],
             "max_features":[2,4,6,8,10,20],
             "min_samples_split":[2,4,6,8,15,20]}

In [43]:
from sklearn.model_selection import GridSearchCV
rf_cv_model = GridSearchCV(rf, rf_params, cv=5, n_jobs=-1, verbose=2).fit(X_train_tfidf,y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


In [44]:
rf_cv_model.best_params_

{'max_depth': 20,
 'max_features': 20,
 'min_samples_split': 20,
 'n_estimators': 500}

In [54]:
rf_tuned = RandomForestClassifier(max_depth=50,max_features=20,min_samples_split=5,n_estimators=1000).fit(X_train_tfidf, y_train)

In [55]:
y_pred = rf_tuned.predict(X_test_tfidf)
y_pred_probs = rf_tuned.predict_proba(X_test_tfidf)

In [56]:
roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr", average="weighted")
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred,average="weighted")
f1 = f1_score(y_test,y_pred,average="weighted")
print(roc_auc)
print(acc)
print(prec)
print(f1)

0.8042654738414443
0.3073286052009456
0.2862683793682286
0.27855233569460525


  _warn_prf(average, modifier, msg_start, len(result))


PyTorch Neural Network

In [95]:
class MovieDataset(torch.utils.data.Dataset):
    def __init__(self,n_rows=None):
        mat = coo_matrix(X_train_tfidf)
        values = mat.data
        indices = np.vstack((mat.row, mat.col))

        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = mat.shape

        self.x_data = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense().to(device)
        self.y_data = torch.Tensor(y_train).to(device)
        
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self,idx):
        preds = self.x_data[idx]
        targets = self.y_data[idx]
        sample = {
            'predictors' : preds,
            'targets': targets
        }
        return sample

In [96]:
class MyLogisticRegression(torch.nn.Module):
    def __init__(self):
        super(MyLogisticRegression,self).__init__()
        input_size = t.shape[1]
        output_size = len(np.unique(y_encoded))
        self.hid1 = torch.nn.Linear(input_size,1000)
        self.hid2 = torch.nn.Linear(1000,1000)
        self.output = torch.nn.Linear(1000,output_size)
        
        torch.nn.init.xavier_uniform_(self.hid1.weight)
        torch.nn.init.zeros_(self.hid1.bias)
        torch.nn.init.xavier_uniform_(self.hid2.weight)
        torch.nn.init.zeros_(self.hid2.bias)
        torch.nn.init.xavier_uniform_(self.output.weight)
        torch.nn.init.zeros_(self.output.bias)
    
    def forward(self,x):
        z = torch.tanh(self.hid1(x))
        z = torch.tanh(self.hid2(z))
        z = self.output(z)
        return z

In [97]:
train_df = MovieDataset()
batch_size = 32
train_ldr = torch.utils.data.DataLoader(train_df,batch_size=batch_size,shuffle=True)
model = MyLogisticRegression().to(device)
model.train()

MyLogisticRegression(
  (hid1): Linear(in_features=12908, out_features=1000, bias=True)
  (hid2): Linear(in_features=1000, out_features=1000, bias=True)
  (output): Linear(in_features=1000, out_features=24, bias=True)
)

In [98]:
l_rate = 0.01
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=l_rate)

In [99]:
for epoch in range(0,100):
    epoch_loss = 0.0
    
    for (batch_idx,batch) in enumerate(train_ldr):
        X = batch['predictors']
        Y = batch['targets'].type(torch.LongTensor).to(device)
        
        optimizer.zero_grad()
        output = model(X)
        
        loss_val = loss(output,Y)
        epoch_loss += loss_val.item()
        loss_val.backward()
        optimizer.step()
    
    if (epoch%10==0):
        print("epoch = %4d   loss = %0.4f" % (epoch, epoch_loss))

epoch =    0   loss = 385.5736
epoch =   10   loss = 35.7186
epoch =   20   loss = 32.2070
epoch =   30   loss = 35.2560
epoch =   40   loss = 29.1059
epoch =   50   loss = 28.3657
epoch =   60   loss = 39.6487
epoch =   70   loss = 30.9993
epoch =   80   loss = 26.4739
epoch =   90   loss = 32.8540


In [100]:
mat = coo_matrix(X_test_tfidf)
values = mat.data
indices = np.vstack((mat.row, mat.col))

i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = mat.shape

X_test_data = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense().to(device)

In [103]:
y_pred = model(X_test_data)

In [104]:
y_pred = [torch.argmax(y_item).item() for y_item in y_pred]

In [108]:
#roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr", average="weighted")
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred,average="weighted")
f1 = f1_score(y_test,y_pred,average="weighted")

In [109]:
print(acc)
print(prec)
print(f1)

0.19976359338061467
0.21129318547531345
0.1968468552155207
