# Loading of the training set.

We peek the datset using the function head() to see its composition.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:/Users/zucco/OneDrive - Politecnico di Milano/Desktop/NLP_task/subtask_1/en/train.tsv.gz",sep="\t",header=0)
pd.set_option('display.max_colwidth', 150)
df.head()

Unnamed: 0,id,text,label
0,12322,you need to stop the engine and wait until it stops. This is how I would do it: // Check if its safe,generated
1,1682,"The Commission shall publish the report; an interim report at least once every two years, and whenever it considers that such a report is necessar...",generated
2,22592,"I have not been tweeting a lot lately, but I did in November, and it was a really good month. I also",generated
3,17390,I pass my exam and really thankgod for that but idk where will I go for shsmy result is ah,human
4,30453,"The template will have 3 parts: a mustache shape, a bow tie shape, and a skinny rectangle. The mustache shape will eventually make the bow loops. ...",human


Word Embedding

# Classifying with word embeddings

In [3]:
from gensim import downloader as api
import string
import re
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt  
import numpy as np
from nlp_project.notebook_utils import evaluate, split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [4]:
df.sample(frac=1)
df['label'] = df['label'].replace({'A':0, 'B':1,'C':2,'D':3,'E':4,'F':5,})
labels=['A','B','C','D','E','F']


In [5]:

regex = '[' + string.punctuation + ']'
def vectorize(docs, embedding_model, useSum ,dim):
    vectors = np.zeros((len(docs),dim))
    for i in range(len(docs)):
        tokens = re.sub(regex, '', docs[i].lower()).split()
        embeddings = [embedding_model.get_vector(token) for token in tokens if token in embedding_model]
        if (len(embeddings) > 0):
            if (useSum): 
                vectors[i] = sum(embeddings)
            else:
                vectors[i] = np.mean(embeddings, axis=0)
    return vectors

In [6]:
def prep_dataset(word_embeds,dim,useSum):
    model = api.load(word_embeds)
    x_train = vectorize(np.array(df['text']),model,useSum,dim)
    y_train= np.array(df['label'])
    x_train, x_val, x_test, y_train, y_val, y_test = split(x_train, y_train, test_size=0.2, val_size=0.0)
    return model,x_train, x_test, y_train, y_test

In [7]:
 def setup_models():   
    models = []
    lr = LogisticRegression(max_iter=1000)
    lr_param = [{
        "solver": ["liblinear"], 
        "penalty": ["l1", "l2"],
        "C":[0.01, 0.1, 1, 10]
    },{
        "solver": ("lbfgs", "sag", "saga"), 
        "penalty": ["l2"],
        "C":[0.01, 0.1, 1]
    }]
    lr_clf = GridSearchCV(lr, lr_param, cv=5, scoring="f1_micro", verbose=1)
    models.append({"name": "Linear Regression", "model": lr_clf})

    # SVC
    svc = SVC()
    svc_param = {"kernel": ["rbf"], "C": [0.1, 1, 10]}
    svc_clf = GridSearchCV(svc, svc_param, cv=5, scoring="f1_micro", verbose=1)
    models.append({"name": "SVC", "model": svc_clf, "subsample": 0.7})

    # ExtraTreesClassifier
    et = ExtraTreesClassifier()
    et_param = {"n_estimators":[10, 50, 100, 200, 500, 1000]}
    et_clf = GridSearchCV(et, et_param, cv=5, scoring="f1_micro", verbose=1)
    models.append({"name": "ExtraTree", "model": et_clf})
    return models


In [8]:
def train_models(models,x_train,y_train):    
    for model in models:
        print(f"Training {model['name']}")
        x_train_, y_train_ = x_train, y_train
        if "subsample" in model.keys():
            x_train_, _, y_train_, _ = train_test_split(
                x_train, 
                y_train, 
                test_size=model["subsample"], 
                stratify=y_train
            )
        model["model"].fit(x_train_, y_train_)
        print("Found best model")
        model["best"] = model["model"].best_estimator_
        model["best"].fit(x_train, y_train)
        print("Trained best model")

In [9]:
def evaluate_models(models,x_test,y_test):
    for model in models:
        print(f"{model['name']}")
        print(f"Best parameters: {model['model'].best_params_}")
        print(f"Best CV score: {model['model'].best_score_}")
        y_pred = model['best'].predict(x_test)
        evaluate(y_test, y_pred, labels=labels)

Model tried:
- glove-wiki-gigaword-50
- word2vec-google-news-300
- conceptnet-numberbatch-17-06-300
- fasttext-wiki-news-subwords-300
- glove-twitter-100
- glove-twitter-200
- glove-twitter-25
- word2vec-ruscorpora-300

Best results with word2vec-google-news-300 with around 46 percent accuracy on SVC.

In [None]:
model,x_train, x_test, y_train, y_test=prep_dataset("word2vec-google-news-300",300,True)
models=setup_models()
train_models(models,x_train,y_train)
evaluate_models(models,x_test,y_test)

Training Linear Regression
Fitting 5 folds for each of 17 candidates, totalling 85 fits


# Building our word embedding

In [10]:
from pandas.core.common import flatten
from gensim.models.word2vec import Word2Vec

In [11]:
docs = [re.sub('\n', ' ', doc) for doc in df.text]
sentences = [re.split('[?!.]\s', doc) for doc in docs]
sentences = list(flatten(sentences))
sentences[:3]

['you need to stop the engine and wait until it stops',
 'This is how I would do it: // Check if its safe',
 'The Commission shall publish the report; an interim report at least once every two years, and whenever it considers that such a report is necessary or appropriate']

In [12]:
tokenized_sentences = [re.sub('\W', ' ', sentence).lower().split() for sentence in sentences]

In [13]:
regex = '[' + string.punctuation + ']'

def vectorize_embd(docs, embedding_model, useSum=False,dim=50):
 
    vectors = np.zeros((len(docs),dim))
    for i in range(len(docs)):
        tokens = re.sub(regex, '', docs[i].lower()).split()
        embeddings = [embedding_model.wv[token] for token in tokens if token in embedding_model.wv.key_to_index]
        
        if (len(embeddings) > 0):
            if (useSum): 
                vectors[i] = sum(embeddings)
            else:
                vectors[i] = np.mean(embeddings, axis=0)
    return vectors

In [14]:
def prep_dataset_word(dim):
    model = Word2Vec(tokenized_sentences, vector_size=dim, min_count=5, window=10)
    x_train = vectorize_embd(np.array(df['text']),model,True,dim)
    y_train= np.array(df['label'])
    x_train, x_val, x_test, y_train, y_val, y_test = split(x_train, y_train, test_size=0.2, val_size=0.0)
    return model,x_train, x_test, y_train, y_test

Model tried with deifferente dimensions:
- 25
- 50
- 100 
- 200 
- 300

Best results with 200 with around 43 percent accuracy on SVC.

In [None]:
model,x_train, x_test, y_train, y_test=prep_dataset_word(200)
models=setup_models()
train_models(models,x_train,y_train)
evaluate_models(models,x_test,y_test)