In [32]:
import pandas as pd
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import contractions
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('YoutubeCommentsDataSet.csv')

In [3]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [4]:
df.Sentiment.value_counts()

Sentiment
positive    11432
neutral      4638
negative     2338
Name: count, dtype: int64

In [5]:
df.describe()

Unnamed: 0,Comment,Sentiment
count,18364,18408
unique,17871,3
top,one of the best thing about dude is that he ne...,positive
freq,10,11432


In [6]:
df.isna().sum()

Comment      44
Sentiment     0
dtype: int64

In [7]:
df2 = df.loc[~df.Comment.isna()]

In [8]:
df2.isna().sum()

Comment      0
Sentiment    0
dtype: int64

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
df2.Comment[0]

'lets not forget that apple pay in 2014 required a brand new iphone in order to use it a significant portion of apples user base wasnt able to use it even if they wanted to as each successive iphone incorporated the technology and older iphones were replaced the number of people who could use the technology increased'

In [11]:
def text_cleaning(text):
    doc = nlp(text)
    cleaned_text = ' '.join([token.lemma_ for token in doc])
    cleaned_text = re.sub(r'([^\s\w]|_)+', ' ', cleaned_text)

    return re.sub(r'\s+', ' ', cleaned_text).strip()   

In [12]:
#df2 = df2.iloc[:200]

In [13]:
df2.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive
...,...,...
195,i own a remarkable 2 and that feeling surreal ...,positive
196,man i love how he went from a smartphone focus...,positive
197,thankyou for the effort you put into your vide...,positive
198,that desk pc is now on my bucket list,positive


In [14]:
df2['cleaned_comment'] = df2['Comment']\
    .apply(text_cleaning)

In [15]:
df2

Unnamed: 0,Comment,Sentiment,cleaned_comment
0,lets not forget that apple pay in 2014 require...,neutral,lets not forget that apple pay in 2014 require...
1,here in nz 50 of retailers don’t even have con...,negative,here in nz 50 of retailer do not even have con...
2,i will forever acknowledge this channel with t...,positive,I will forever acknowledge this channel with t...
3,whenever i go to a place that doesn’t take app...,negative,whenever I go to a place that do not take appl...
4,apple pay is so convenient secure and easy to ...,positive,apple pay be so convenient secure and easy to ...
...,...,...,...
195,i own a remarkable 2 and that feeling surreal ...,positive,I own a remarkable 2 and that feel surreal to ...
196,man i love how he went from a smartphone focus...,positive,man I love how he go from a smartphone focus c...
197,thankyou for the effort you put into your vide...,positive,thankyou for the effort you put into your vide...
198,that desk pc is now on my bucket list,positive,that desk pc be now on my bucket list


In [16]:
df2 = df2.loc[~(df2.Sentiment == 'neutral')]

In [17]:
X, y = df2['cleaned_comment'], df2['Sentiment']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [19]:
y_train.value_counts()

Sentiment
positive    106
negative     26
Name: count, dtype: int64

In [20]:
y_test.value_counts()

Sentiment
positive    26
negative     7
Name: count, dtype: int64

In [23]:
tfidf_model = TfidfVectorizer(max_features=500)

In [25]:
X_train

110    I m always look forward to these every month s...
3      whenever I go to a place that do not take appl...
120    one really important point those ionizer can p...
61                         I live for johns roast of att
58     y all be pick the literal good professional fo...
                             ...                        
96     my son have botox injection in his leg for yea...
133    as a specialty coffee lover those cometeer pod...
18     lmgs growth be honestly something to truly asp...
118    this series have help I and my friend find som...
129    its so crazy to I I ve watch you all the way t...
Name: cleaned_comment, Length: 132, dtype: object

In [26]:
tfidf_df = pd.DataFrame(tfidf_model.fit_transform(X_train).todense())

In [29]:
tfidf_df.columns = sorted(tfidf_model.vocabulary_)

In [30]:
tfidf_df.head()

Unnamed: 0,100,20,50,80,able,about,absolutely,accept,access,accessory,...,would,wow,write,xbox,year,yet,you,young,your,youtuber
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.112595,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.070898,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
logreg = LogisticRegression()

In [34]:
logreg.fit(tfidf_df,y_train)

In [None]:


logreg = LogisticRegression()

logreg.fit(tfidf_df,review_data['target'])

predicted_labels = logreg.predict(tfidf_df)

logreg.predict_proba(tfidf_df)[:,1]