In [1]:
# DATA CLEANING STEP

In [2]:
from collections import Counter
from collections import defaultdict

import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('punkt')
# nltk.download('wordnet')

In [3]:
# Create dataframe from the training datset file
df_train = pd.read_csv('datasets/train3.csv', header='infer', low_memory=False)

# Create dataframe from the testing datset file
df_test = pd.read_csv('datasets/test3.csv', header='infer', low_memory=False)

In [4]:
# Check if dataframes looks good
df_train

Unnamed: 0,Text,Class,ID
0,Fun little place to stop and have lunch There...,positive,1307144
1,This place was PACKED Went for late night foo...,negative,5544361
2,We board the plane 30 minutes before the actua...,positive,5956200
3,Chaotic 4 story place but fun for kids The w...,positive,718388
4,After (intentionally) capsizing a sailboat in ...,neutral,174754
...,...,...,...
55995,My new fav restaurant by far!!!! Its hard to...,positive,3794413
55996,Not sure why the great reviews she abraded my...,neutral,3946261
55997,Food was okay We had the all you can eat buff...,neutral,1237126
55998,Great place for after work drinks!!! Went wit...,positive,1532373


In [5]:
df_test

Unnamed: 0,ID,Text
0,178,!:)I could pass out here and survive the zombi...
1,344,15% tips sneaked into our bill even though we ...
2,2324,How bad could it be? I thought After all its...
3,3217,Jordan is awesome! I have a lot of muscle tens...
4,3705,Micheals Arts and CraftsWell they live under ...
...,...,...
13995,6662447,wifi doesnt work Also even it works it is v...
13996,6662575,wish I could select zero stars This place wa...
13997,6664830,ya know this place is so good and so unassumi...
13998,6665591,you definitely dont want to bring a first date...


In [6]:
# View all columns and their data types
df_train.info()
print('\n')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    56000 non-null  object
 1   Class   56000 non-null  object
 2   ID      56000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 875.1+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      14000 non-null  int64 
 1   Text    14000 non-null  object
dtypes: int64(1), object(1)
memory usage: 164.1+ KB


In [7]:
# Remove ID columns from both dataframes
del df_train['ID']
del df_test['ID']

In [8]:
# Word Extraction

# Convert all words to lowercase to avoid duplicate words in different fields
print('Before \n' + df_train['Text'][1] + '\n')
df_train['Text'] = df_train['Text'].str.lower()
df_test['Text'] = df_test['Text'].str.lower()
print('After \n' + df_train['Text'][1])

Before 
This place was PACKED  Went for late night food  They serve late which is awesome  Ended up having to get our food to go because they had 1 cook & he got slammed w/ orders  I got pork schnitzel sandwich  The breading was dark but still good   however there was NO  potato salad on sandwich as described   Disappointing   Boyfriend ordered the spicy burger ($9) He said it was OK  He wasnt super impressed  Waffle fries were good  Prices are reasonable  Debating whether wed give them another shot 

After 
this place was packed  went for late night food  they serve late which is awesome  ended up having to get our food to go because they had 1 cook & he got slammed w/ orders  i got pork schnitzel sandwich  the breading was dark but still good   however there was no  potato salad on sandwich as described   disappointing   boyfriend ordered the spicy burger ($9) he said it was ok  he wasnt super impressed  waffle fries were good  prices are reasonable  debating whether wed give them an

In [9]:
# Replace some slang and word meanings based on our judgement in the dataset

In [10]:
# Replace some 'hello' in other languages than english usually being correlated to positive reviews
hellos = ['aloha', 'bonjour', 'namaste']

for hello in hellos:
    df_train['Text'] = df_train['Text'].str.replace(hello, 'happy', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(hello, 'happy', regex=False)

In [11]:
# Replace some short terms usually correlating with reviews
goods = ['lol', 'atm']
bads = ['smh', ' fk', 'lmfao', 'rip', 'wtf']
neutrals = ['aight']

for good in goods:
    df_train['Text'] = df_train['Text'].str.replace(good, 'happy', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(good, 'happy', regex=False)

for bad in bads:
    df_train['Text'] = df_train['Text'].str.replace(bad, 'bad', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(bad, 'bad', regex=False)
    
for neutral in neutrals:
    df_train['Text'] = df_train['Text'].str.replace(neutral, 'neutral', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(neutral, 'neutral', regex=False)

In [12]:
# Replace variations of smiley faces to correlate to happy and bad
happy = [':)', ': )', ':  )', ':]', ': ]', ':  ]', ':}', ': }', ':  }', ';)', '; )', ';  )', ';]', '; ]', ';  ]', ';}', '; }', ';  }']
bad = [':(', ': (', ':  (', ':[', ': [', ':  [', ':{', ': {', ':  {', ';(', '; (', ';  (', ';[', '; [', ';  [', ';{', '; {', ';  {']

print('Before \n' + df_train['Text'][2916] + '\n')

for face in happy:
    df_train['Text'] = df_train['Text'].str.replace(face, 'happy', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(face, 'happy', regex=False)

for face in bad:
    df_train['Text'] = df_train['Text'].str.replace(face, 'bad', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(face, 'bad', regex=False)
    
print('After \n' + df_train['Text'][2916])

Before 
decided to finally try out this spot since we had passed it at least once a week heading to citizen for coffee  its hard to find parking for this little joint so you may need to walk a little but we felt it was definitely worth it  we sat at the tables outdoors where they have a nice little patio setup  the staff were very friendly and if you go for brunch you can start your day with a nice mimosa bucket  they have everything from breakfast items  sandwiches  salads  and meat entries  the food was great it tasted home grown with only the best ingredients  we will definitely be coming back in the future  only we wont wait so long this time : )

After 
decided to finally try out this spot since we had passed it at least once a week heading to citizen for coffee  its hard to find parking for this little joint so you may need to walk a little but we felt it was definitely worth it  we sat at the tables outdoors where they have a nice little patio setup  the staff were very friendly

In [13]:
# Word Placing Fixes

In [14]:
# Add a space between some special characters so we can later easily distinguish them into seperate words
print('Before \n' + df_train['Text'][14] + '\n')
df_train['Text'] = df_train['Text'].str.replace('!', ' ! ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('?', ' ? ', regex=False)
print('After \n' + df_train['Text'][14] + '\n')

df_test['Text'] = df_test['Text'].str.replace('!', ' ! ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('?', ' ? ', regex=False)

# Remove some special characters that have no significance
df_train['Text'] = df_train['Text'].str.replace('(', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace(')', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('[', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace(']', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('{', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('}', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('/', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('@', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('&', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('*', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace(';', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace(':', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('~', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('|', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('^', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('<', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('>', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('_', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('#', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('%', ' ', regex=False)
df_train['Text'] = df_train['Text'].str.replace('@', ' ', regex=False)

df_test['Text'] = df_test['Text'].str.replace('(', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace(')', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('[', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace(']', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('{', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('}', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('/', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('@', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('&', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('*', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace(';', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace(':', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('~', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('|', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('^', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('<', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('>', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('_', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('#', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('%', ' ', regex=False)
df_test['Text'] = df_test['Text'].str.replace('@', ' ', regex=False)

Before 
i love pizza!!tonys has a great lunch special  a slice of pizza with a side salad or garlic knots and a drink for like $6 50 the pizza is great! i have been just for a slice and also for a pie with the fam! my dad used to get the sausage and peppers and really liked it  the salad is good with the mozzarella on top of it and the italian dressing its perfect for a quick bite or to pick up  i do suggest if you dine in and want a pie  to order everything beforehand and then get there and enjoy it without the wait!

After 
i love pizza !  ! tonys has a great lunch special  a slice of pizza with a side salad or garlic knots and a drink for like $6 50 the pizza is great !  i have been just for a slice and also for a pie with the fam !  my dad used to get the sausage and peppers and really liked it  the salad is good with the mozzarella on top of it and the italian dressing its perfect for a quick bite or to pick up  i do suggest if you dine in and want a pie  to order everything befor

In [15]:
# Stop Word Removal

In [16]:
# Remove words included in stop_words.txt
print('Before \n' + df_train['Text'][0] + '\n')

with open('datasets/stop_words.txt', 'r') as file:
    data = file.read().splitlines()

for d in data:
    df_train['Text'] = df_train['Text'].str.replace(' ' + d + ' ',' ')
    df_test['Text'] = df_test['Text'].str.replace(' ' + d + ' ',' ')

print('After \n' + df_train['Text'][0])

Before 
fun little place to stop and have lunch  there was plenty of parking out back  and more tables on the patio than there was inside  the lunch menu has all the basics  i chose the turkey sandwich which was very moist and tasty  i also ordered a strawberry banana smoothie which hit the spot on this hot day  this a dog friendly place that has tie outs here and there 

After 
fun little place stop lunch  plenty parking  tables patio inside  lunch menu basics  chose turkey sandwich moist tasty  ordered strawberry banana smoothie hit spot hot day  dog friendly place tie outs 


In [17]:
# Character removal

In [18]:
# Remove redundant '.' & ','

df_train['Text'] = df_train['Text'].str.replace('.', '', regex=False)
df_train['Text'] = df_train['Text'].str.replace(',', '', regex=False)

df_test['Text'] = df_test['Text'].str.replace('.', '', regex=False)
df_test['Text'] = df_test['Text'].str.replace(',', '', regex=False)

In [19]:
# Remove words that arent in the dictionary

print('Before \n' + df_train['Text'][137] + '\n')
regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]')
def valid_words(Text):
    tokenized_words = nltk.word_tokenize(Text)
    tokenized_sentence = []
    for word in tokenized_words:
        if wordnet.synsets(word):
            tokenized_sentence.append(word)
        elif len(word) is 1 and regex.search(word) is not None:
            tokenized_sentence.append(word)
    tokenized_sentence = " ".join(tokenized_sentence)
    return tokenized_sentence

df_train['Text'] = df_train['Text'].apply(valid_words)
df_test['Text'] = df_test['Text'].apply(valid_words)
print('After \n' + df_train['Text'][137])

Before 
i minus  scheduled appointment seen sinus infection drainage ear canals  informed  dr  chiara symptoms  going couple months  told severe hearing loss  prescribed antibiotic  created severe gi distress scheduled meet audiologist hearing aids examining ear s determine need hearing aids  looking new better ent  dont waste time  place laughable !  !  ! 

After 
i minus scheduled appointment seen sinus infection drainage ear canals informed symptoms going couple months told severe hearing loss prescribed antibiotic created severe gi distress scheduled meet hearing aids examining ear s determine need hearing aids looking new better waste time place laughable ! ! !


In [20]:
# Remove single letter words

df_train['Text'] = ' ' + df_train['Text']
df_test['Text'] = ' ' + df_test['Text']

print('Before \n' + df_train['Text'][137] + '\n')
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
for letter in alphabet:
    df_train['Text'] = df_train['Text'].str.replace(' ' + letter + ' ', ' ', regex=False)
    df_test['Text'] = df_test['Text'].str.replace(' ' + letter + ' ', ' ', regex=False)
    
print('After \n' + df_train['Text'][137])

Before 
 i minus scheduled appointment seen sinus infection drainage ear canals informed symptoms going couple months told severe hearing loss prescribed antibiotic created severe gi distress scheduled meet hearing aids examining ear s determine need hearing aids looking new better waste time place laughable ! ! !

After 
 minus scheduled appointment seen sinus infection drainage ear canals informed symptoms going couple months told severe hearing loss prescribed antibiotic created severe gi distress scheduled meet hearing aids examining ear determine need hearing aids looking new better waste time place laughable ! ! !


In [21]:
# Remove all numeric values in the text

print('Before \n' + df_train['Text'][200] + '\n')
df_train['Text'] = df_train['Text'].str.replace('\d+', '', regex=True)
df_test['Text'] = df_test['Text'].str.replace('\d+', '', regex=True)
print('After \n' + df_train['Text'][200])

Before 
 dog teddy groomed dropped let know exactly wanted told use 3 4 inch clippers leave paws fluffy head went pick mortified life did come drop receptionist went grab know head oddly shaped small fur legs longer index finger oh paws completely shaved soap body hair definitely 3 4 inch face shaved small unbearable haircut offered talk touch decided best touch hair

After 
 dog teddy groomed dropped let know exactly wanted told use   inch clippers leave paws fluffy head went pick mortified life did come drop receptionist went grab know head oddly shaped small fur legs longer index finger oh paws completely shaved soap body hair definitely   inch face shaved small unbearable haircut offered talk touch decided best touch hair


In [22]:
# Omitted at the moment, the stemming seems to really make the words too vague. Ex) exactly -> exactli
# Apply Porter Stemming Algorithim

# print('Before \n' + df_train['Text'][2] + '\n')
# porter = PorterStemmer()

# def stem_text(Text):
#     tokenized_words = nltk.word_tokenize(Text)
#     tokenized_sentence = []
#     for word in tokenized_words:
#         tokenized_sentence.append(porter.stem(word))
#     tokenized_sentence = " ".join(tokenized_sentence)
#     return tokenized_sentence

# df_train['Text'] = df_train['Text'].apply(stem_text)
# df_test['Text'] = df_test['Text'].apply(stem_text)
# print('After \n' + df_train['Text'][2])

In [23]:
# Apply Lemmatization to the dataset

print('Before \n' + df_train['Text'][137] + '\n')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(Text):
    tokenized_words = nltk.word_tokenize(Text)
    tokenized_sentence = []
    for word in tokenized_words:
        new_word = lemmatizer.lemmatize(word, pos="n")
        new_word = lemmatizer.lemmatize(new_word, pos="a")
        new_word = lemmatizer.lemmatize(new_word, pos="v")
        tokenized_sentence.append(new_word)
    tokenized_sentence = " ".join(tokenized_sentence)
    return tokenized_sentence

df_train['Text'] = df_train['Text'].apply(lemmatize_text)
df_test['Text'] = df_test['Text'].apply(lemmatize_text)
print('After \n' + df_train['Text'][137])

Before 
 minus scheduled appointment seen sinus infection drainage ear canals informed symptoms going couple months told severe hearing loss prescribed antibiotic created severe gi distress scheduled meet hearing aids examining ear determine need hearing aids looking new better waste time place laughable ! ! !

After 
minus schedule appointment see sinus infection drainage ear canal inform symptom go couple month tell severe hear loss prescribe antibiotic create severe gi distress schedule meet hear aid examine ear determine need hear aid look new good waste time place laughable ! ! !


In [24]:
# Create new column which will have the reviews represented as a word list

# Turn the datafram column for reviews into a list of words for easier analysis
df_train['Text_As_Word_List'] = df_train.Text.apply(lambda x: x.split(' '))
df_test['Text_As_Word_List'] = df_test.Text.apply(lambda x: x.split(' '))

# See how many reviews contain a set of words or not
df_train.Text_As_Word_List.map(set(['leave', '!']).issubset).value_counts()

False    54496
True      1504
Name: Text_As_Word_List, dtype: int64

In [25]:
# Create new column which will have the reviews represented as a frequency count dictionary

def convert_to_dict(Text_As_Word_List):
    words_to_count = (word for word in Text_As_Word_List)
    c = Counter(words_to_count)
    freq = c.most_common(len(c))
    dict = defaultdict(list)
    for k, v in freq:
        dict[k].append(v)
    return dict

df_train['Text_As_Freq_Dict'] = df_train['Text_As_Word_List'].apply(convert_to_dict)
df_test['Text_As_Freq_Dict'] = df_test['Text_As_Word_List'].apply(convert_to_dict)

print(df_train['Text_As_Freq_Dict'][0])

defaultdict(<class 'list'>, {'place': [2], 'lunch': [2], 'fun': [1], 'little': [1], 'stop': [1], 'plenty': [1], 'park': [1], 'table': [1], 'patio': [1], 'inside': [1], 'menu': [1], 'basic': [1], 'choose': [1], 'turkey': [1], 'sandwich': [1], 'moist': [1], 'tasty': [1], 'order': [1], 'strawberry': [1], 'banana': [1], 'smoothie': [1], 'hit': [1], 'spot': [1], 'hot': [1], 'day': [1], 'dog': [1], 'friendly': [1], 'tie': [1], 'out': [1]})


In [26]:
# Get the most occured words in the dataset

Counter(" ".join(df_train["Text"]).split()).most_common(100)

[('!', 70240),
 ('good', 27716),
 ('place', 25812),
 ('food', 24380),
 ('great', 23104),
 ('service', 18470),
 ('time', 18048),
 ('come', 15482),
 ('like', 14466),
 ('order', 13349),
 ('just', 13090),
 ('go', 12025),
 ('get', 11108),
 ('love', 10767),
 ('try', 10681),
 ('really', 10402),
 ('price', 8630),
 ('best', 8602),
 ('nice', 8469),
 ('say', 8139),
 ('staff', 8122),
 ('friendly', 7946),
 ('$', 7867),
 ('make', 7429),
 ('look', 7382),
 ('wait', 7248),
 ('do', 7129),
 ('recommend', 7070),
 ('restaurant', 6919),
 ('amaze', 6850),
 ('definitely', 6796),
 ('want', 6699),
 ('eat', 6633),
 ('?', 6266),
 ('work', 6252),
 ('chicken', 5968),
 ('delicious', 5955),
 ('experience', 5899),
 ('drink', 5732),
 ('need', 5719),
 ('take', 5666),
 ('little', 5579),
 ('bad', 5554),
 ('day', 5548),
 ('happy', 5511),
 ('know', 5484),
 ('think', 5269),
 ('people', 5213),
 ('taste', 4987),
 ('ask', 4885),
 ('customer', 4878),
 ('pretty', 4840),
 ('menu', 4799),
 ('pizza', 4752),
 ('new', 4735),
 ('fry', 

In [27]:
# View the finalized schema

df_train

Unnamed: 0,Text,Class,Text_As_Word_List,Text_As_Freq_Dict
0,fun little place stop lunch plenty park table ...,positive,"[fun, little, place, stop, lunch, plenty, park...","{'place': [2], 'lunch': [2], 'fun': [1], 'litt..."
1,place pack go late night food serve late aweso...,negative,"[place, pack, go, late, night, food, serve, la...","{'late': [2], 'food': [2], 'get': [2], 'order'..."
2,board plane minute actual flight make sure lea...,positive,"[board, plane, minute, actual, flight, make, s...","{'flight': [4], '!': [4], 'minute': [3], 'hour..."
3,chaotic story place fun kid white chocolate m ...,positive,"[chaotic, story, place, fun, kid, white, choco...","{'chaotic': [1], 'story': [1], 'place': [1], '..."
4,after intentionally capsize sailboat spend goo...,neutral,"[after, intentionally, capsize, sailboat, spen...","{'after': [1], 'intentionally': [1], 'capsize'..."
...,...,...,...,...
55995,new restaurant far ! ! ! ! hard remember time ...,positive,"[new, restaurant, far, !, !, !, !, hard, remem...","{'!': [5], 'new': [1], 'restaurant': [1], 'far..."
55996,not sure great review abrade big toe keep get ...,neutral,"[not, sure, great, review, abrade, big, toe, k...","{'?': [4], 'not': [1], 'sure': [1], 'great': [..."
55997,food okay eat buffet waiter keep get order wro...,neutral,"[food, okay, eat, buffet, waiter, keep, get, o...","{'food': [1], 'okay': [1], 'eat': [1], 'buffet..."
55998,great place work drink ! ! ! go team friday dr...,positive,"[great, place, work, drink, !, !, !, go, team,...","{'!': [4], 'drink': [2], 'go': [2], 'great': [..."


In [28]:
df_test

Unnamed: 0,Text,Text_As_Word_List,Text_As_Freq_Dict
0,! pas survive zombie apocalypse bathroom decor...,"[!, pas, survive, zombie, apocalypse, bathroom...","{'!': [9], 'like': [2], '?': [2], 'spend': [2]..."
1,tip sneak get crappy service busy obvious reas...,"[tip, sneak, get, crappy, service, busy, obvio...","{'ask': [4], 'service': [2], 'come': [2], 'lit..."
2,bad ? think close home winter ill wear hat per...,"[bad, ?, think, close, home, winter, ill, wear...","{'home': [2], 'ill': [2], 'hat': [2], 'say': [..."
3,jordan awesome ! lot muscle tension cause head...,"[jordan, awesome, !, lot, muscle, tension, cau...","{'jordan': [1], 'awesome': [1], '!': [1], 'lot..."
4,art live ! art craft book paint brush art tool...,"[art, live, !, art, craft, book, paint, brush,...","{'art': [3], 'stuff': [2], 'live': [1], '!': [..."
...,...,...,...
13995,wifi work work stupid customer bring laptop co...,"[wifi, work, work, stupid, customer, bring, la...","{'work': [2], 'wifi': [1], 'stupid': [1], 'cus..."
13996,wish select zero star place bad anticipate bui...,"[wish, select, zero, star, place, bad, anticip...","{'bad': [2], 'minute': [2], 'doctor': [2], 'ca..."
13997,know place good unassuming great food awesome ...,"[know, place, good, unassuming, great, food, a...","{'really': [3], 'good': [2], 'food': [2], 'awe..."
13998,definitely want bring date ready sloppy lick f...,"[definitely, want, bring, date, ready, sloppy,...","{'lot': [2], 'definitely': [1], 'want': [1], '..."


In [29]:
# View the finalized schema statistics

df_train.info()
print('\n')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Text               56000 non-null  object
 1   Class              56000 non-null  object
 2   Text_As_Word_List  56000 non-null  object
 3   Text_As_Freq_Dict  56000 non-null  object
dtypes: object(4)
memory usage: 875.1+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Text               14000 non-null  object
 1   Text_As_Word_List  14000 non-null  object
 2   Text_As_Freq_Dict  14000 non-null  object
dtypes: object(3)
memory usage: 164.1+ KB


In [30]:
# Write out the pre processed training and test dataframes to a csv file

df_train['Text_As_Freq_Dict'] = df_train['Text_As_Freq_Dict'].astype(str)
df_test['Text_As_Freq_Dict'] = df_test['Text_As_Freq_Dict'].astype(str)

df_train['Text_As_Freq_Dict'] = df_train['Text_As_Freq_Dict'].str.slice(28, -1)
df_test['Text_As_Freq_Dict'] = df_test['Text_As_Freq_Dict'].str.slice(28, -1)

df_train.to_csv('cleaned_datasets/cleaned_training_data.csv', index=False)
df_test.to_csv('cleaned_datasets/cleaned_test_data.csv', index=False)

# Done