In [1]:
train_path = r"E:\CHANDRU\chand proj\MOUD\VideoReviews\transcriptions\*.csv"

In [2]:
import glob
import numpy as np
import pandas as pd
import sklearn
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.DataFrame()
for f in glob.glob(train_path):
    df = df.append(pd.read_csv(f,sep=';'),ignore_index=True)

In [4]:
df.head()

Unnamed: 0,#starttime,#endtime,transcription,sentimentAnnotations,Speech,speech,sentimentAnnotation,sentimentannotations
0,0.0,3.642,yo habia visto resenas que decian que picaba c...,-1.0,,,,
1,3.642,9.552,y la verdad es que si la use una vez y t- y te...,-1.0,,,,
2,9.552,14.197,y dije no: puede ser posible tanto la deseaba ...,-1.0,,,,
3,14.197,20.545,esta tambien tira un poquito de pelo pero haga...,-1.0,,,,
4,20.545,23.275,pero igual con las lavadas se ha dejado de tir...,1.0,,,,


In [5]:
# funcion to append all utterances to dataframe
def create_data_df(df_name,data_path):
    
    # Creating dataframe of entire transcriptions
    for f in glob.glob(data_path):
        df_name = df_name.append(pd.read_csv(f,sep=';'),ignore_index=True)
    
    # combine multiple speech, annotation columns to one and drop rest of columns
    if 'Speech' not in df_name.columns:
        df_name['Speech'] = ''    
    if 'speech' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','speech']].fillna('').sum(axis=1)   
    if 'transcription' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','transcription']].fillna('').sum(axis=1)
    
    if 'sentimentAnnotation' not in df_name.columns:
        df_name['sentimentAnnotation'] = 0    
    if 'sentimentAnnotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentAnnotations']].fillna(0).sum(axis=1)
    if 'sentimentannotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentannotations']].fillna(0).sum(axis=1)
    
    # Remove neutral annotations
    df_name = df_name.query('sentimentAnnotation != 0')
    
    df_name = df_name[['Speech','sentimentAnnotation']].reset_index(drop=True)  
    return df_name

In [6]:
df = pd.DataFrame()
df = create_data_df(df,train_path)
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,yo habia visto resenas que decian que picaba c...,-1.0
1,y la verdad es que si la use una vez y t- y te...,-1.0
2,y dije no: puede ser posible tanto la deseaba ...,-1.0
3,esta tambien tira un poquito de pelo pero haga...,-1.0
4,pero igual con las lavadas se ha dejado de tir...,1.0


In [7]:
print(df.shape)
df.shape

(450, 2)


(450, 2)

In [8]:
import re
from bs4 import BeautifulSoup
import nltk


In [9]:
from nltk.corpus import stopwords

# resuable function to convert raw speech to preprocessed
def utterance_to_words(raw_utterance):
    # 1. Removing HTML elements from text
    utterance_text = BeautifulSoup(raw_utterance,"lxml").get_text()
    # TRANSLATION
    #translated_utterance = translator.translate(utterance_text,from_lang="spanish",to_lang = 'english')
    # 2. Keeping only letters
    letters_only = re.sub("[^a-zA-Z]", " ", utterance_text) 
    # 3. Converting to lower case and splitting into individual words
    lowercase_words = letters_only.lower().split()
    # 4. converting the stop words to a set to help faster execution
    spanish_stops = set(stopwords.words("english"))
    # 5. Removing stop words from the text
    meaningful_words = [w for w in lowercase_words if not w in spanish_stops]
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words ))
    
# applying the function to the speech columns
df['Speech'] = df['Speech'].apply(lambda x: utterance_to_words(x))
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,yo habia visto resenas que decian que picaba c...,-1.0
1,la verdad es que si la use una vez te arde asi...,-1.0
2,dije puede ser posible tanto la deseaba arde l...,-1.0
3,esta tambien tira un poquito de pelo pero haga...,-1.0
4,pero igual con las lavadas se ha dejado de tirar,1.0


In [10]:
def translate(text):
    from googletrans import Translator
    translator = Translator()
    translated_text = translator.translate(text, dest='en').text
    return translated_text

In [16]:
t_tags=[]
for i in df['Speech']:
    s=translate(i)
    t_tags.append(s)

In [17]:
df['Review']=t_tags

In [18]:
X, Y = df[['Review']],df[['sentimentAnnotation']]

In [19]:
print(X)
Y

                                                Review
0    I had seen penetrates that they said when you ...
1    The truth is that if I use it once you burns y...
2    I said it can be possible so much I wanted to ...
3    This also throws a little hair but they realiz...
4    But the same with the washes has stopped throwing
..                                                 ...
445  Already in other LIs in other videos already t...
446  If they are curious if they have caught their ...
447  Be the first time that I saw them were so parents
448  Here I am five books after being super fan and...
449  sometimes he reads this book I felt like Cassa...

[450 rows x 1 columns]


Unnamed: 0,sentimentAnnotation
0,-1.0
1,-1.0
2,-1.0
3,-1.0
4,1.0
...,...
445,1.0
446,1.0
447,-1.0
448,1.0


In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=41)
print('x_train:',x_train.shape)
print('y_train:',y_train.shape)
print('x_test:',x_test.shape)
print('y_test:',y_test.shape)


x_train: (360, 1)
y_train: (360, 1)
x_test: (90, 1)
y_test: (90, 1)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vector = TfidfVectorizer()
X_train_t = tf_vector.fit_transform(x_train['Review'])
X_train_t

<360x1011 sparse matrix of type '<class 'numpy.float64'>'
	with 4128 stored elements in Compressed Sparse Row format>

In [22]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# def create_tfidf_vectors(train_reviews):
#     tf_vector = TfidfVectorizer()
#     tfidf_vectors = tf_vector.fit_transform(train_reviews)
#     return tfidf_vectors

In [23]:
X_test_tf = tf_vector.transform(x_test['Review'])
X_test_tf

<90x1011 sparse matrix of type '<class 'numpy.float64'>'
	with 925 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.linear_model import LogisticRegression
models = LogisticRegression()
models.fit(X_train_t,y_train)


In [25]:
from sklearn.metrics import accuracy_score
l_train_score = models.predict(X_train_t)
l_train_accuracy = accuracy_score(l_train_score,y_train)
print('train_accuracy:',l_train_accuracy)

train_accuracy: 0.9138888888888889


In [26]:
l_test_score = models.predict(X_test_tf)
l_test_accuracy = accuracy_score(l_test_score,y_test) 
print('test_acccuracy:',l_test_accuracy)

test_acccuracy: 0.7


In [27]:
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
cmx_1=confusion_matrix(y_test,l_test_score)
print("\nNo. of test samples : ",len(x_test))
print("\n Confustion Matrix : \n",cmx_1)
print("\nPerfomance measures are: \n",classification_report(y_test, l_test_score))


No. of test samples :  90

 Confustion Matrix : 
 [[43  6]
 [21 20]]

Perfomance measures are: 
               precision    recall  f1-score   support

        -1.0       0.67      0.88      0.76        49
         1.0       0.77      0.49      0.60        41

    accuracy                           0.70        90
   macro avg       0.72      0.68      0.68        90
weighted avg       0.72      0.70      0.69        90



In [39]:
from sklearn import svm
support = svm.SVC()
support.fit(X_train_t,y_train)
train_score_1 = support.predict(X_train_t)
train_accuracy_1 = accuracy_score(train_score_1,y_train)


In [40]:
test_score_1 = support.predict(X_test_tf)
test_accuracy_1 = accuracy_score(test_score_1,y_test)


In [41]:
cmx_2=confusion_matrix(test_score_1,y_test)
print("\nNo. of test samples : ",len(x_test))
print("\n Confustion Matrix : \n",cmx_2)
print("\nPerfomance measures are: \n",classification_report(test_score_1,y_test))


No. of test samples :  90

 Confustion Matrix : 
 [[44 19]
 [ 5 22]]

Perfomance measures are: 
               precision    recall  f1-score   support

        -1.0       0.90      0.70      0.79        63
         1.0       0.54      0.81      0.65        27

    accuracy                           0.73        90
   macro avg       0.72      0.76      0.72        90
weighted avg       0.79      0.73      0.74        90



In [42]:
dft = df.loc[4,'Review']
print(dft)

But the same with the washes has stopped throwing


In [43]:
data={"review":["The truth is that if I use it once you burns you where you use it burns your eye",
                "But the same with the washes has stopped throwing"]}
data=pd.DataFrame(data)
new=tf_vector.transform(data['review'])

pred=support.predict(new)
pred

array([-1.,  1.])

In [44]:
data1={"review":["Go for it. Fabric is good. But have small doubt if it will gets shrink after washing. Overall it's good. Though the dupatta is shorter."
                ,"I want to returned it because of size issue but there is no option. Disappointed One person found this helpful"]} 
data1=pd.DataFrame(data1)
new=tf_vector.transform(data1['review'])

pred1=support.predict(new)
pred1

array([ 1., -1.])

In [45]:
import pickle
# Save the model using pickle.dump()
with open('support.pkl', 'wb') as file:
    pickle.dump(support, file)

In [46]:
import pickle
# Save the model using pickle.dump()
with open('tf_vector.pkl', 'wb') as file:
    pickle.dump(tf_vector, file)

In [47]:
file.close()

In [48]:
# Load the saved model using pickle.load()
with open('support.pkl', 'rb') as file:
    support= pickle.load(file)

In [49]:

data2={"review":["The truth is that if I use it once you burns you where you use it burns your eye",
                "But the same with the washes has stopped throwing"]}
data2=pd.DataFrame(data2)
new2=tf_vector.transform(data2['review'])

pred2=support.predict(new2)
pred2


array([-1.,  1.])