# 1. Import Libraries and Data

In [1]:
import nltk
import pandas as pd
from pandas import DataFrame
import csv
import matplotlib.pyplot as mplot
from nltk.corpus import stopwords
stop = stopwords.words('english')
import numpy as np
nltk.download('wordnet')
from textblob import TextBlob
from textblob import Word

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#reading the csv file
train = pd.read_csv('Scrapped data.csv', index_col = 0)

In [3]:
pd.set_option('expand_frame_repr', False)
print(train)

                                                Title                                             Review
0                Business Owners AVOID at all costs..  American Express Open... openly takes full con...
1   Look elsewhere...Visa, MasterCard or Discover ...  So I requested a credit line increase because ...
2    AMERICAN EXPRESS - THE COMPANY YOU CAN'T RELY ON  Booked a holiday & paid by American Express fo...
3                              Never get an amex card  flights cancelled with airline.Been trying for...
4                     The Platinum Perennial Torture!  The Platinum Perennial Torture!Despite being A...
5                               Good customer support                  Great company. Resolved my issues
6                       Wonderful Customer Service!!!  I've never had customer service answer me so q...
7               I had a dispute opened with TravelUp.  I had a dispute opened with TravelUp.Whereas m...
8                                          The worst!  

# 2. Basic data features to know the details of the reviews

In [4]:
#Count of words
train['word_count'] = train['Review'].apply(lambda x: len(str(x).split(" ")))

#Count of characters
train['char_count'] = train['Review'].str.len() ## this also includes spaces

#Count of stopwords
train['stopwords'] = train['Review'].apply(lambda x: len([x for x in x.split() if x in stop]))

#count of numbers
train['numerics'] = train['Review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

train[['Review','word_count','char_count','stopwords','numerics']].head()

Unnamed: 0,Review,word_count,char_count,stopwords,numerics
0,American Express Open... openly takes full con...,228,1291,98,1
1,So I requested a credit line increase because ...,111,580,45,0
2,Booked a holiday & paid by American Express fo...,58,339,19,1
3,flights cancelled with airline.Been trying for...,33,188,13,0
4,The Platinum Perennial Torture!Despite being A...,42,264,8,2


# 3. Pre-processing of the data

In [5]:
#converting to lowercase
train['Review'] = train['Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#removing punctuations
train['Review'] = train['Review'].str.replace('[^\w\s]','')

#removing stopwords
train['Review'] = train['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#lemmatization-it reduces the word to its root form
train['Review'] = train['Review'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

#Spelling correction
train['Review']=train['Review'].apply(lambda x: str(TextBlob(x).correct()))

train['Review'].head()

0    american express open openly take full control...
1    requested credit line increase american expres...
2    booked holiday paid american express nearly 52...
3    flight canceled airlinebeen trying week get re...
4    platino perennial torturedespite apex big plat...
Name: Review, dtype: object

# 4. Term Frequency

In [6]:
tf1 = (train['Review']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['Words','TF']
tf1

Unnamed: 0,Words,TF
0,business,6.0
1,express,16.0
2,american,16.0
3,make,6.0
4,pay,12.0
...,...,...
457,informed,1.0
458,40,1.0
459,appeared,1.0
460,appreciate,1.0


# 5. Inverse Document Frequency

In [8]:
for i,word in enumerate(tf1['Words']):
    tf1.loc[i, 'IDF'] = np.log(train.shape[0]/(len(train[train['Review'].str.contains(word)])))
tf1

Unnamed: 0,Words,TF,IDF
0,business,6.0,2.995732
1,express,16.0,0.916291
2,american,16.0,0.916291
3,make,6.0,1.609438
4,pay,12.0,1.049822
...,...,...,...
457,informed,1.0,2.995732
458,40,1.0,2.302585
459,appeared,1.0,2.995732
460,appreciate,1.0,2.995732


# 6. TF-IDF for each term

In [10]:
tf1['TF-IDF'] = tf1['TF'] * tf1['IDF']
tf1

Unnamed: 0,Words,TF,IDF,TF-IDF
0,business,6.0,2.995732,17.974394
1,express,16.0,0.916291,14.660652
2,american,16.0,0.916291,14.660652
3,make,6.0,1.609438,9.656627
4,pay,12.0,1.049822,12.597865
...,...,...,...,...
457,informed,1.0,2.995732,2.995732
458,40,1.0,2.302585,2.302585
459,appeared,1.0,2.995732,2.995732
460,appreciate,1.0,2.995732,2.995732


# 7. Saving the FIle

In [11]:
tf1.to_csv(r'TFIDF.csv')