In [1]:
import glob
import numpy as np
import pandas as pd
import sklearn
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_path = r"D:\PROJECT\Datasets\MOUD\VideoReviews\transcriptions\*.csv"
df = pd.DataFrame()
for f in glob.glob(train_path):
    df = df.append(pd.read_csv(f,sep=';'),ignore_index=True)

In [3]:
df.head()

Unnamed: 0,#starttime,#endtime,transcription,sentimentAnnotations,Speech,speech,sentimentAnnotation,sentimentannotations
0,0.0,3.642,yo habia visto resenas que decian que picaba c...,-1.0,,,,
1,3.642,9.552,y la verdad es que si la use una vez y t- y te...,-1.0,,,,
2,9.552,14.197,y dije no: puede ser posible tanto la deseaba ...,-1.0,,,,
3,14.197,20.545,esta tambien tira un poquito de pelo pero haga...,-1.0,,,,
4,20.545,23.275,pero igual con las lavadas se ha dejado de tir...,1.0,,,,


In [4]:
df.isnull()

Unnamed: 0,#starttime,#endtime,transcription,sentimentAnnotations,Speech,speech,sentimentAnnotation,sentimentannotations
0,False,False,False,False,True,True,True,True
1,False,False,False,False,True,True,True,True
2,False,False,False,False,True,True,True,True
3,False,False,False,False,True,True,True,True
4,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...
493,False,False,True,False,False,True,True,True
494,False,False,True,False,False,True,True,True
495,False,False,True,False,False,True,True,True
496,False,False,True,False,False,True,True,True


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   #starttime            498 non-null    float64
 1   #endtime              498 non-null    float64
 2   transcription         64 non-null     object 
 3   sentimentAnnotations  473 non-null    float64
 4   Speech                392 non-null    object 
 5   speech                42 non-null     object 
 6   sentimentAnnotation   21 non-null     float64
 7   sentimentannotations  4 non-null      float64
dtypes: float64(5), object(3)
memory usage: 31.2+ KB


In [6]:
# funcion to append all utterances to dataframe
def create_data_df(df_name,data_path):
    
    # Creating dataframe of entire transcriptions
    for f in glob.glob(data_path):
        df_name = df_name.append(pd.read_csv(f,sep=';'),ignore_index=True)    
    # combine multiple speech, annotation columns to one and drop rest of columns
    if 'Speech' not in df_name.columns:
        df_name['Speech'] = ''    
    if 'speech' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','speech']].fillna('').sum(axis=1)   
    if 'transcription' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','transcription']].fillna('').sum(axis=1)
    
    if 'sentimentAnnotation' not in df_name.columns:
        df_name['sentimentAnnotation'] = 0    
    if 'sentimentAnnotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentAnnotations']].fillna(0).sum(axis=1)
    if 'sentimentannotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentannotations']].fillna(0).sum(axis=1)
    
    # Remove neutral annotations
    #df_name = df_name.query('sentimentAnnotation != 0')
    
    df_name = df_name[['Speech','sentimentAnnotation']].reset_index(drop=True)  
    return df_name

In [7]:
df = pd.DataFrame()
df = create_data_df(df,train_path)
df

Unnamed: 0,Speech,sentimentAnnotation
0,yo habia visto resenas que decian que picaba c...,-1.0
1,y la verdad es que si la use una vez y t- y te...,-1.0
2,y dije no: puede ser posible tanto la deseaba ...,-1.0
3,esta tambien tira un poquito de pelo pero haga...,-1.0
4,pero igual con las lavadas se ha dejado de tir...,1.0
...,...,...
493,Ya en otros li- en otros videos ya les he dich...,1.0
494,"Si tienen curiosidad, o si les ha llamado la a...",1.0
495,"O sea, / yo la primera vez que los vi no se me...",-1.0
496,"Y aqui estoy, cinco libros despues, siendo sup...",1.0


In [8]:
df.sample(5)

Unnamed: 0,Speech,sentimentAnnotation
417,// Asi que no me gusta.,-1.0
289,"Y bueno, si Rubi me gusto, Zafiro me gusto mas...",1.0
166,no lo puedes difuminar // para nada. //,-1.0
150,en fin. Es-,0.0
470,Lo que yo pienso. Lo que yo opino es // que to...,1.0


In [9]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import translators as ts

Using state Telangana server backend.


In [10]:
def utterance_to_words(raw_utterance):
    # 1. Removing HTML elements from text
    utterance_text = BeautifulSoup(raw_utterance,"lxml").get_text()
    # 2. Keeping only letters
    letters_only = re.sub("[^a-zA-Z]", " ", utterance_text) 
    # 3. Converting to lower case and splitting into individual words
    lowercase_words = letters_only.lower().split()
    # 4. converting the stop words to a set to help faster execution
    spanish_stops = set(stopwords.words("english"))
    # 5. Removing stop words from the text
    meaningful_words = [w for w in lowercase_words if not w in spanish_stops]
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words ))
    
# applying the function to the speech columns
df['Speech'] = df['Speech'].apply(lambda x: utterance_to_words(x))
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,yo habia visto resenas que decian que picaba c...,-1.0
1,la verdad es que si la use una vez te arde asi...,-1.0
2,dije puede ser posible tanto la deseaba arde l...,-1.0
3,esta tambien tira un poquito de pelo pero haga...,-1.0
4,pero igual con las lavadas se ha dejado de tirar,1.0


In [11]:
## TRANSLATION Spanish into English
df["Speech"]=df["Speech"].apply(lambda x: ts.google(x,from_language="es",to_language="en"))
df.head()


Unnamed: 0,Speech,sentimentAnnotation
0,I had seen penetrates that they said when you ...,-1.0
1,The truth is that if I use it once you burns y...,-1.0
2,I said it can be possible so much I wanted to ...,-1.0
3,This also throws a little hair but they realiz...,-1.0
4,But the same with the washes has stopped throwing,1.0


In [12]:
X_trn, y_trn = df[['Speech']],df[['sentimentAnnotation']]

# countVectorizer initialization
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             lowercase = True,    \
                             max_features = 5000) 

# create bag of words vector for the training set using countVectorizer
train_data_features = vectorizer.fit_transform(X_trn['Speech'].values)

# tf-idf transformer initialization
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

# create tfidf transformed vector  for the training set using tf-idf transformer
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train_tfidf,y_trn, test_size=0.3, random_state=10)

In [14]:
print("X_train shape",x_train.shape)
print("x_test shape",x_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

X_train shape (348, 1257)
x_test shape (150, 1257)
y_train (348, 1)
y_test (150, 1)


In [15]:
from sklearn import svm
model_tf = svm.SVC().fit(x_train,y_train['sentimentAnnotation'].values)

# generate predictions
predicted_tf = model_tf.predict(x_test)

# Classification report
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
print(classification_report(y_test['sentimentAnnotation'].values, predicted_tf))

              precision    recall  f1-score   support

        -1.0       0.62      0.88      0.73        78
         0.0       0.00      0.00      0.00        12
         1.0       0.74      0.47      0.57        60

    accuracy                           0.65       150
   macro avg       0.45      0.45      0.43       150
weighted avg       0.62      0.65      0.61       150



In [16]:
print("SVM accuracy score: ",accuracy_score(y_test,predicted_tf))
# Confusion Matrix
confusion_matrix(y_test,predicted_tf)

SVM accuracy score:  0.6466666666666666


array([[69,  0,  9],
       [11,  0,  1],
       [32,  0, 28]], dtype=int64)

In [17]:
from sklearn.linear_model import LogisticRegression
model_tf = LogisticRegression().fit(x_train,y_train['sentimentAnnotation'].values)

# generate predictions
predicted_tf = model_tf.predict(x_test)
print("  LogisticRegression accuracy score: ",accuracy_score(y_test,predicted_tf))

  LogisticRegression accuracy score:  0.6533333333333333


In [18]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train,y_train['sentimentAnnotation'].values)
predicted_tf = model_tf.predict(x_test)
print("KNeighborsClassifier accuracy score: ",accuracy_score(y_test,predicted_tf))

KNeighborsClassifier accuracy score:  0.6533333333333333
