# Environment setup

In [4]:
import pandas as pd
from datetime import datetime
import spacy
import spacy_transformers
from spacy.tokens import DocBin

# Dataset

## Enti

In [None]:
# Load the gold-standard dataset from the processed-data folder
df = pd.read_csv("../data/processed/Gold_Standard.csv")

# Turn the single 'Class' label into one 'yes' flag per polarity column.
# 'mix' appears twice on purpose: a mixed tweet is flagged as both
# positive and negative.
class_to_flag = [
    ('pos', 'Pos_GS'),
    ('neg', 'Neg_GS'),
    ('neut', 'Neut_GS'),
    ('mix', 'Pos_GS'),
    ('mix', 'Neg_GS'),
]
for class_label, flag_column in class_to_flag:
    df.loc[df['Class'] == class_label, flag_column] = 'yes'

# Cells that are still empty become 'no'. Note that this fill is applied to
# every column of the frame, not only to the three flag columns.
df = df.fillna("no")

df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,year,month,day,tweetOrig,tweet_x,Final_Class,Class,Irony,Pos_GS,Neg_GS,Neut_GS
0,0,1462002288835403777,2021,11,20,In arrivo un nuovo #bonus #inps! Scopri chi pu...,in arrivo un nuovo bonus inps scopri chi può o...,pos,pos,no,yes,no,no
1,1,1354381987507744771,2021,1,27,"Allora, riepiloghiamo;\nAi politici la pension...",allora riepiloghiamo ai politici la pensione d...,neg_ir,neg,yes,no,yes,no
2,2,1454050817821003783,2021,10,29,Caro @INPS_it e cari @Europarl_IT fate bene i...,caro e cari fate bene i vostri conti perché no...,neg_ir,neg,yes,no,yes,no
3,3,1393675898960982016,2021,5,15,"FOTO - A #napoli, dopo l'apertura di una #vora...",foto a napoli dopo l apertura di una voragine ...,neut,neut,no,no,no,yes
4,4,1417876270705164289,2021,7,21,Maxi esercitazione di #protezionecivile. Lo sc...,maxi esercitazione di protezionecivile lo scen...,neut,neut,no,no,no,yes


In [None]:
# Peek at the cleaned text ('tweet_x') of the row labelled 0
df.loc[0, 'tweet_x']

'in arrivo un nuovo bonus inps scopri chi può ottenere fino a euro le domande devono essere presentate entro il dicembre 😁'

In [None]:
# Dimensions of the loaded dataset as (rows, columns)
df.shape

(1900, 13)

In [None]:
# Train-test splitting: draw a reproducible random 80% of the rows for
# training and keep every row that was not drawn as the test set.
train_fraction = 0.8
split_seed = 25
train = df.sample(frac=train_fraction, random_state=split_seed)
test = df.drop(index=train.index)

In [None]:
# Checking the shape: print (rows, columns) of the train and test partitions
# to confirm the split sizes
print(train.shape, test.shape)

(1520, 13) (380, 13)


## SentiPolc

In [None]:
# Load the SentiPolc training set and derive yes/no polarity flags from its
# 'lpos' / 'lneg' indicator columns.
sentiTrain = pd.read_csv("../data/processed/SentiPolc/training_set_sentipolc16.csv")

# One boolean mask per flag column; a row is neutral when both indicators are 0.
sentiTrain_flags = {
    'Pos_GS': sentiTrain['lpos'] == 1,
    'Neg_GS': sentiTrain['lneg'] == 1,
    'Neut_GS': (sentiTrain['lpos'] == 0) & (sentiTrain['lneg'] == 0),
}
for flag_column, is_flagged in sentiTrain_flags.items():
    sentiTrain.loc[is_flagged, flag_column] = 'yes'

# Remaining empty cells (in any column) become 'no'
sentiTrain = sentiTrain.fillna("no")

In [None]:
# The SentiPolc test file is read without a header row, so the column names
# are supplied explicitly.
colnames = ['subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top', 'text']
sentiTest = pd.read_csv(
    "../data/processed/SentiPolc/test_set_sentipolc16_gold2000.csv",
    header=None,
    names=colnames,
)

# One boolean mask per flag column; a row is neutral when both indicators are 0.
sentiTest_flags = {
    'Pos_GS': sentiTest['lpos'] == 1,
    'Neg_GS': sentiTest['lneg'] == 1,
    'Neut_GS': (sentiTest['lpos'] == 0) & (sentiTest['lneg'] == 0),
}
for flag_column, is_flagged in sentiTest_flags.items():
    sentiTest.loc[is_flagged, flag_column] = 'yes'

# Remaining empty cells (in any column) become 'no'
sentiTest = sentiTest.fillna("no")

In [None]:
# Load the large Italian spaCy pipeline; the `document` helper below reads it
# through the notebook-level name `nlp`.
# (`spacy` is already imported in the environment-setup cell, so it is not
# re-imported here.)
nlp = spacy.load("it_core_news_lg")

# Show which components the loaded pipeline contains
nlp.pipe_names

['tok2vec',
 'morphologizer',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

## spaCy input format conversion

In [None]:
def document(data):
  """Parse (text, label) pairs with the global ``nlp`` pipeline and attach categories.

  Args:
    data: iterable of ``(text, label)`` tuples, passed to ``nlp.pipe`` with
      ``as_tuples=True``.

  Returns:
    A list with one parsed doc per input pair, in input order. For the label
    'yes' the doc gets ``cats == {'yes': 1, 'no': 0}``; for 'no' it gets
    ``cats == {'yes': 0, 'no': 1}``. A doc whose label is anything else is
    still returned, but its ``cats`` are left untouched.
  """
  docs = []
  for parsed, label in nlp.pipe(data, as_tuples = True):
    if label == 'yes' or label == 'no':
      # The two categories are mutually exclusive, so one score fixes the other.
      positive = int(label == 'yes')
      parsed.cats['yes'] = positive
      parsed.cats['no'] = 1 - positive
    docs.append(parsed)
  return docs

In [None]:
def convertToSpacy (sentiment,train,test,text_col='tweetOrig'):
    """Serialise a train/test split to spaCy ``DocBin`` files for one sentiment.

    Each row is turned into a ``(text, label)`` pair, parsed and labelled by
    ``document``, and the resulting docs are written to disk. The elapsed time
    of each of the two conversions is printed.

    Args:
        sentiment: 'pos', 'neg' or 'neut'; selects the label column
            ('Pos_GS', 'Neg_GS' or 'Neut_GS') and is embedded in the output
            file names.
        train: table with the training rows; must provide ``text_col`` and the
            selected label column. It is not modified.
        test: table with the held-out rows, same requirements as ``train``.
            It is not modified.
        text_col: name of the column holding the text to parse. Defaults to
            'tweetOrig'; pass another name for tables that store the text
            under a different column.

    Returns:
        None. The function is called for its side effects: it writes
        ``../data/processed/Enti/train_<sentiment>.spacy`` and
        ``../data/processed/valid_<sentiment>.spacy``.

    Raises:
        ValueError: if ``sentiment`` is not one of the supported values.
    """
    label_columns = {'pos': 'Pos_GS', 'neg': 'Neg_GS', 'neut': 'Neut_GS'}
    if sentiment not in label_columns:
        # Fail early: otherwise the raw tables would be handed to `document`
        # and produce a confusing error (or garbage) further down.
        raise ValueError(
            "sentiment must be one of {}, got {!r}".format(sorted(label_columns), sentiment)
        )
    label_col = label_columns[sentiment]

    # Build the (text, label) pairs directly instead of storing them in a
    # helper column, so the caller's tables are left unchanged.
    train_pairs = list(zip(train[text_col], train[label_col]))
    test_pairs = list(zip(test[text_col], test[label_col]))

    #Time for converting into binary document for train dataset
    start_time = datetime.now()
    #Passing the train dataset into function 'document'
    train_docs = document(train_pairs)
    #Creating binary document using DocBin function in spaCy
    doc_bin = DocBin(docs = train_docs)
    #Saving the binary document as train_<sentiment>.spacy
    doc_bin.to_disk("../data/processed/Enti/train_"+sentiment+".spacy")
    #Time duration for train dataset
    end_time = datetime.now()
    print('Training dataset creation: {}'.format(end_time - start_time))

    #Time for converting into binary document for test dataset
    start_time = datetime.now()
    #Passing the test dataset into function 'document'
    test_docs = document(test_pairs)
    doc_bin = DocBin(docs = test_docs)
    # NOTE(review): unlike the training file, this path has no "Enti/"
    # sub-folder. Kept as-is because other files may point at this location;
    # confirm whether the difference is intended.
    doc_bin.to_disk("../data/processed/valid_"+sentiment+".spacy")
    #Printing the time duration for test dataset
    end_time = datetime.now()
    print('Test dataset creation: {}'.format(end_time - start_time))
    

In [None]:
# Export the train/test split, labelled with the neutral flag ('Neut_GS'),
# to .spacy files
convertToSpacy('neut',train,test)

Training dataset creation: 0:00:05.566588
Test dataset creation: 0:00:01.269435


# spaCy config file

In [None]:
#Converting base configuration into full config file
# Runs the spaCy CLI in a subshell: reads ./base_config.cfg and writes the
# completed configuration to ./config.cfg.
# NOTE(review): the training cell reads "../models/configs/config.cfg" — confirm
# that ./config.cfg resolves to that same file from the notebook's working directory.

!python -m spacy init fill-config ./base_config.cfg ./config.cfg

# spaCy training

In [None]:
start_time = datetime.now()
 
!python -m spacy train "../models/configs/config.cfg" --verbose  --output "../models/Models/BERT-xxl/Cross/Positive" --gpu-id 0

end_time = datetime.now()

print('Duration: {}'.format(end_time - start_time))