In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import re
import string
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix

import gensim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [190]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) | set(stopwords.words('french')) | set(stopwords.words('italian'))

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\artem.kuzmin\AppData\Roaming\nltk_data...


True

In [None]:
w2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz", binary=True
)

In [None]:
df = pd.read_csv('each_genre200.csv')

In [None]:
def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    text = ' '.join(text.split())
    return text

In [None]:
def create_avg_w2v_vectors(sentence):
    avg_vector = np.zeros(300)
    for word in sentence:
        if (word in w2vec):
            avg_vector += w2vec[word]
    if (len(sentence)==0):
        return avg_vector
    return avg_vector/len(sentence)

In [None]:
def extract_XY(df):
    X = df[['title','description']]
    y = df['first_genre']
    X['cleaned_titles'] = X['title'].apply(lambda x: clean_text(x))
    X['cleaned_descriptions'] = X['description'].apply(lambda x: clean_text(x))
    X['tokenized_titles'] = X['cleaned_titles'].apply(lambda x: x.split())
    X['tokenized_descriptions'] = X['cleaned_descriptions'].apply(lambda x: x.split())
    X['w2v_avg_titles'] = X['tokenized_titles'].apply(lambda line: create_avg_w2v_vectors(line))
    X['w2v_avg_descriptions'] = X['tokenized_descriptions'].apply(lambda line: create_avg_w2v_vectors(line))
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    return X,y_encoded

In [192]:
def train_model(X_train,y_train):
    model = SVC(kernel='rbf',probability=True)
    svc_params = {"C": list(np.arange(0.1,2.5,0.1))}

    model_cv = GridSearchCV(model, svc_params, cv=3, n_jobs=-1,verbose=2).fit(X_train,y_train)
    model = SVC(kernel='rbf',C=model_cv.best_params_['C'],probability=True).fit(X_train,y_train)
    
    return model

def predict(model,X_test,y_test):
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)
    
    roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr", average="weighted")
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average="weighted")
    f1 = f1_score(y_test,y_pred,average="weighted")
    print(f"ROC_AUC:{roc_auc}")
    print(f"Accuracy:{acc}")
    print(f"Precision:{prec}")
    print(f"F1-score:{f1}")
    
    return y_pred,y_pred_probs

In [194]:
X,y = extract_XY(df)
X_train_titles, X_test_titles, y_train_titles, y_test_titles = train_test_split(X[f'w2v_avg_titles'], y, test_size=0.2, random_state=42, stratify=y)
X_train_titles, X_test_titles = list(X_train_titles), list(X_test_titles)

X_train_descriptions, X_test_descriptions, y_train_descriptions, y_test_descriptions = train_test_split(X[f'w2v_avg_descriptions'], y, test_size=0.2, random_state=42, stratify=y)
X_train_descriptions, X_test_descriptions = list(X_train_descriptions), list(X_test_descriptions)

model_titles = train_model(X_train_titles,y_train_titles)
y_pred_titles,_ = predict(model_titles,X_test_titles,y_test_titles)

model_descriptions = train_model(X_train_descriptions,y_train_descriptions)
y_pred_descriptions,_ = predict(model_descriptions,X_test_descriptions,y_test_descriptions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned_titles'] = X['title'].apply(lambda x: clean_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cleaned_descriptions'] = X['description'].apply(lambda x: clean_text(x))


Fitting 3 folds for each of 24 candidates, totalling 72 fits


  _warn_prf(average, modifier, msg_start, len(result))


ROC_AUC:0.6894792203549827
Accuracy:0.18203309692671396
Precision:0.1811324192795199
F1-score:0.16923150841832893
Fitting 3 folds for each of 24 candidates, totalling 72 fits
ROC_AUC:0.8583927840425705
Accuracy:0.3416075650118203
Precision:0.3207744821095317
F1-score:0.32746349667036495


  _warn_prf(average, modifier, msg_start, len(result))
