# Intro to AI
Understanding how machine make sense of text data

In [34]:
# !pip install regex pandas inflect numpy contractions scikit-learn matplotlib wordcloud nltk

In [2]:
import regex as re
import pandas as pd
import inflect
import numpy as np
import contractions
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Error loading punkt: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>


In [4]:
unclean_data = pd.read_csv("Amazon-Product-Reviews-Sentiment-Analysis-in-Python-Dataset (1).csv")
unclean_data.head(10)

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1
3,The cable was not compatible between my macboo...,1
4,The case is nice but did not have a glow light...,1
5,The cable keeps coming up with message that th...,1
6,This pos broke off in my phone after 3 uses an...,1
7,This product suck its hard 2 hear wat other ar...,1
8,Music cuts off within 30 secs. I like Aukey's ...,1
9,Yeah when they say mirror screen they mean it ...,1


In [5]:
unclean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     24999 non-null  object
 1   Sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [6]:
unclean_data.describe()

Unnamed: 0,Sentiment
count,25000.0
mean,3.0
std,1.414242
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


## Data Cleaning

In [7]:
# remove any Missing values
unclean_data.dropna(inplace=True)

In [8]:
# Replace Sentiment Columns with Positive == 1 and Negative == 0
unclean_data['Sentiment'] = unclean_data['Sentiment'].astype(int)
unclean_data["Sentiment"] = np.where(unclean_data['Sentiment'] <= 3, 0, 1)
unclean_data.head(10)

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,0
1,This case takes so long to ship and it's not e...,0
2,Good for not droids. Not good for iPhones. You...,0
3,The cable was not compatible between my macboo...,0
4,The case is nice but did not have a glow light...,0
5,The cable keeps coming up with message that th...,0
6,This pos broke off in my phone after 3 uses an...,0
7,This product suck its hard 2 hear wat other ar...,0
8,Music cuts off within 30 secs. I like Aukey's ...,0
9,Yeah when they say mirror screen they mean it ...,0


In [9]:
# Get the number of positive and negative sentiments
unclean_data['Sentiment'].value_counts()

Sentiment
0    15000
1     9999
Name: count, dtype: int64

In [10]:
# Drop Duplicate columns
before = unclean_data.shape[0]
unclean_data.drop_duplicates(inplace=True)
after = unclean_data.shape[0]
print(f"Number of Rows dropped", before - after)

Number of Rows dropped 228


## Remove HTML Tags

In [11]:
def remove_html(text):
    html_tag = re.compile(r'<.*?>')
    text = html_tag.sub(r'', text)
    return text
unclean_data['Review'] = unclean_data['Review'].apply(remove_html)

In [12]:
unclean_data.head(-100)

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,0
1,This case takes so long to ship and it's not e...,0
2,Good for not droids. Not good for iPhones. You...,0
3,The cable was not compatible between my macboo...,0
4,The case is nice but did not have a glow light...,0
...,...,...
24892,The screen protector was very easy to put on a...,1
24893,Good sturdy construction. No chance of warpin...,1
24894,I love it.cheap and good.,1
24896,My ipod is now protected! I love the color and...,1


### Remove Punctuation

In [13]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
unclean_data['Review'] = unclean_data['Review'].apply(remove_punctuation)

In [14]:
unclean_data.head(-100)
# now the text is cleaned, next step is to make the words in lower cases

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,0
1,This case takes so long to ship and its not ev...,0
2,Good for not droids Not good for iPhones You c...,0
3,The cable was not compatible between my macboo...,0
4,The case is nice but did not have a glow light...,0
...,...,...
24892,The screen protector was very easy to put on a...,1
24893,Good sturdy construction No chance of warping...,1
24894,I love itcheap and good,1
24896,My ipod is now protected I love the color and ...,1


### Lower Case Each Word

In [15]:
def lower_case(text):
    return text.lower()
unclean_data['Review'] = unclean_data['Review'].apply(lower_case)

In [16]:
unclean_data.head(10)

Unnamed: 0,Review,Sentiment
0,fast shipping but this product is very cheaply...,0
1,this case takes so long to ship and its not ev...,0
2,good for not droids not good for iphones you c...,0
3,the cable was not compatible between my macboo...,0
4,the case is nice but did not have a glow light...,0
5,the cable keeps coming up with message that th...,0
6,this pos broke off in my phone after 3 uses an...,0
7,this product suck its hard 2 hear wat other ar...,0
8,music cuts off within 30 secs i like aukeys pr...,0
9,yeah when they say mirror screen they mean it ...,0


In [17]:
# data is now cleaned
clean_data = unclean_data

In [18]:
tokenizer = TweetTokenizer()
clean_data['Review'] = clean_data['Review'].apply(tokenizer.tokenize)

In [19]:
clean_data

Unnamed: 0,Review,Sentiment
0,"[fast, shipping, but, this, product, is, very,...",0
1,"[this, case, takes, so, long, to, ship, and, i...",0
2,"[good, for, not, droids, not, good, for, iphon...",0
3,"[the, cable, was, not, compatible, between, my...",0
4,"[the, case, is, nice, but, did, not, have, a, ...",0
...,...,...
24995,"[these, cables, lightning, are, far, superior,...",1
24996,"[this, unit, performs, exactly, as, advertised...",1
24997,"[i, had, the, key, cut, at, a, local, hardware...",1
24998,"[i, love, my, case, i, ordered, from, amazon, ...",1


### Lemmatization

In [22]:
lemmatizer = WordNetLemmatizer()
def lemmatize_word(tokens):
    return " ".join([lemmatizer.lemmatize(token) for token in tokens])

clean_data['Review'] = clean_data['Review'].apply(lemmatize_word)

### Split dataset into Train and Test

In [24]:
X = clean_data['Review']
Y = clean_data['Sentiment']
X_train, X_test , Y_train, Y_test = train_test_split(X,Y, test_size= 0.2,shuffle= True ,random_state= 1101)
print("Shape of X_train:" , X_train.shape[0])
print("Shape of Y_train:" , Y_train.shape[0])
print("Shape of X_test:" , X_test.shape[0])
print("Shape of Y_test:" , X_test.shape[0])
print(clean_data['Review'].shape[0])

Shape of X_train: 19816
Shape of Y_train: 19816
Shape of X_test: 4955
Shape of Y_test: 4955
24771


### Vectorization

In [25]:
# Initialize object
vectorizer  = TfidfVectorizer()

# Convert all the reviews into TFIDF
tf_idf_X_train = vectorizer.fit_transform(X_train)
tf_idf_X_test = vectorizer.transform(X_test)
print(tf_idf_X_test.shape) # checking if both have the same number of features
print(tf_idf_X_train.shape)

(4955, 33800)
(19816, 33800)


### Train the Model (RandomForest, LR)

In [27]:
from sklearn.ensemble import RandomForestClassifier
number_of_trees = [50, 100, 150, 200]
for i in number_of_trees:
    random_forest_clf =  RandomForestClassifier(n_estimators=i)
    random_forest_model = random_forest_clf.fit(tf_idf_X_train, Y_train) # fit the model
    predict_random_forest_model_test = random_forest_model.predict(tf_idf_X_test)
    predict_random_forest_model_train = random_forest_model.predict(tf_idf_X_train)
    train_set_accuracy = accuracy_score(predict_random_forest_model_train, Y_train)
    test_set_accuracy = accuracy_score(predict_random_forest_model_test, Y_test)
    print(f"accuracy on the training set with {i} trees: {train_set_accuracy}")
    print(f"accuracy on the test set with {i} trees: {test_set_accuracy}\n")

accuracy on the training set with 50 trees: 0.9985870004037142
accuracy on the test set with 50 trees: 0.7824419778002019

accuracy on the training set with 100 trees: 0.9985870004037142
accuracy on the test set with 100 trees: 0.7927346115035317

accuracy on the training set with 150 trees: 0.9985870004037142
accuracy on the test set with 150 trees: 0.7937436932391524

accuracy on the training set with 200 trees: 0.9985870004037142
accuracy on the test set with 200 trees: 0.7909182643794147



In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
logistic_model = LogisticRegression(max_iter=500, random_state=1101)

param_grid = {
    'C': [0.01 ,0.1, 1, 10, 100], 
    'penalty': ['l2'],  
    'solver': ['lbfgs']  
}
grid_search_logistic = GridSearchCV(estimator=logistic_model, param_grid= param_grid,scoring='accuracy', cv = 5)
grid_search_logistic.fit(tf_idf_X_train, Y_train)

# Best parameters and average accuracy score
print("Best Parameters:", grid_search_logistic.best_params_)
print("Mean Cross-Validation Accuracy:", grid_search_logistic.best_score_)

# Evaluate on the test set
best_log_reg = grid_search_logistic.best_estimator_
test_accuracy = accuracy_score(Y_test, best_log_reg.predict(tf_idf_X_test))
print("Test Accuracy:", test_accuracy)

Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Mean Cross-Validation Accuracy: 0.8279167694718019
Test Accuracy: 0.82744702320888


In [33]:
new_text = ["the customer service was awesome"]
new_vector = vectorizer.transform(new_text)
prediction = best_log_reg.predict(new_vector)
print("Prediction:", prediction)


Prediction: [1]
