In [53]:
# Import all libraries here
import csv
import time

import nltk as nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the 'punkt' NLTK data package (a no-op when it is already installed).
# Presumably this is the tokenizer model word_tokenize relies on -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kalee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
# Load the training reviews and split them into parallel lists:
#   corpus -- the review text of every row, in file order
#   labels -- the matching 'stars' value of every row
result = pd.read_json('../data/data_train.json')

# Pull whole columns out at once instead of walking the frame with
# iterrows(), which builds a Series per row and is very slow on large frames.
corpus = result['text'].tolist()
labels = result['stars'].tolist()

In [47]:
# Data preprocessing: stem every token so that inflected forms collapse to a
# single root (hunt, hunting, hunted -> hunt). Lemmatization is a possible
# alternative to try later.
# This is the slowest step of the notebook (originally estimated at ~30 min),
# so the wall-clock time is printed before and after.

print("Local current time :", time.asctime())
ps = PorterStemmer()


def _stem_document(document):
    """Tokenize *document* and return its stems, each preceded by one space."""
    return "".join(" " + ps.stem(token) for token in word_tokenize(document))


corpus_stemmed = [_stem_document(document) for document in corpus]
print("Local current time :", time.asctime())

Local current time : Tue Dec  3 11:23:15 2019
Local current time : Tue Dec  3 13:51:25 2019


In [49]:
# Write the labels to a CSV file, one label per line.
# csv.writer.writerows expects an iterable of rows, so each label is wrapped
# in its own single-column row.
# Relies on `import csv` in the notebook's import cell.
row_list = [[label] for label in labels]
print(len(row_list))
with open('../data/corpus_labels.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(row_list)

334295


In [50]:
# Feature extraction using TfidfVectorizer.
# min_df / max_df are tuned to keep the vocabulary small but still useful:
# terms in fewer than 0.1% or more than 50% of the documents are dropped.

print ("Feature extraction: Local current time :", time.asctime( time.localtime(time.time()) ))
vectorizer = TfidfVectorizer(min_df = 0.001, max_df = 0.5)
# Only the fitted vocabulary/IDF weights are needed here; the transformed
# matrix was previously computed and thrown away, so fit() is sufficient.
vectorizer.fit(corpus_stemmed)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(len(features)) # should be 3800 features
print ("Ending feature extraction: Local current time :", time.asctime( time.localtime(time.time()) ))

Feature extraction: Local current time : Tue Dec  3 13:56:36 2019
3800
Ending feature extraction: Local current time : Tue Dec  3 13:57:42 2019


In [51]:
import scipy.sparse

# Turn each stemmed training document into a feature vector using the fitted
# vectorizer, then save the resulting matrix to disk in .npz format.
feature_vector = vectorizer.transform(corpus_stemmed)

# NOTE(review): the recorded output for this cell is a shape tuple, which does
# not match printing the type -- the saved output looks stale; confirm intent.
print(type(feature_vector))
scipy.sparse.save_npz(
    file='../data/corpus_feature_vectors.npz',
    matrix=feature_vector,
)

(334295, 3800)
