# Import modules

In [178]:
from collections import Counter

import pandas as pd
import numpy as np
import nltk
import sklearn
import requests
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read data

In [85]:
def _read_review_lines(path):
    """Read a text file holding one review per line into a one-column frame.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``review`` column, one row per non-blank line.
    """
    # Plain line reading replaces ``pd.read_csv(sep="\n")``, which current
    # pandas rejects with a ValueError. UTF-8 mirrors the read_csv default.
    with open(path, encoding="utf-8") as handle:
        reviews = [line.rstrip("\r\n") for line in handle]
    # Blank lines are dropped, as read_csv did via ``skip_blank_lines``.
    reviews = [review for review in reviews if review.strip()]
    return pd.DataFrame({'review': reviews})


def read_data(path_pos, path_neg):
    """Load positive and negative reviews into one shuffled, labelled frame.

    Parameters
    ----------
    path_pos : str
        File with one positive review per line.
    path_neg : str
        File with one negative review per line.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (text) and ``positive`` (1 for rows from
        ``path_pos``, 0 for rows from ``path_neg``), shuffled with a fixed
        seed. The per-file row index is kept, so index labels repeat
        across the two classes.
    """
    pos = _read_review_lines(path_pos)
    pos['positive'] = 1
    neg = _read_review_lines(path_neg)
    neg['positive'] = 0
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported equivalent and keeps the original index labels.
    combined_df = pd.concat([pos, neg])
    return shuffle(combined_df, random_state=42)

In [86]:
# Load the train, dev and test splits. Each split has one file of positive
# reviews and one of negative reviews; read_data merges and shuffles them.
train_files = {"path_pos": "Data/IMDb/train/imdb_train_pos.txt",
               "path_neg": "Data/IMDb/train/imdb_train_neg.txt"}
dev_files = {"path_pos": "Data/IMDb/dev/imdb_dev_pos.txt",
             "path_neg": "Data/IMDb/dev/imdb_dev_neg.txt"}
test_files = {"path_pos": "Data/IMDb/test/imdb_test_pos.txt",
              "path_neg": "Data/IMDb/test/imdb_test_neg.txt"}

train = read_data(**train_files)
dev = read_data(**dev_files)
test = read_data(**test_files)

# Explore data: class balance of the shuffled training set

In [98]:
# Tally the labels once, then report each class (1 = positive, 0 = negative).
label_counts = train['positive'].value_counts()
n_positive = label_counts[1]
n_negative = label_counts[0]
print("No of positive reviews\n-----")
print(n_positive)
print("\nNo of negative reviews\n-----")
print(n_negative)

No of positive reviews
-----
7483

No of negative reviews
-----
7517


# Initiate lemmatizer, stopwords, and token retriever

#### Create a list of tokens that have been lemmatized and made lower case (repeated tokens are kept so they can be counted)

In [151]:
# Single WordNet lemmatizer instance, shared by the tokenising code in this
# notebook (get_tokens and getTokensVocab both call lemmatizer.lemmatize).
lemmatizer = nltk.stem.WordNetLemmatizer()

In [152]:
def get_tokens(string):
    """Tokenise ``string`` and return its lemmatised, lower-cased tokens.

    The text is split into sentences first and each sentence into word
    tokens. Tokens are returned in reading order and duplicates are kept.

    Parameters
    ----------
    string : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        One entry per token occurrence.
    """
    # NOTE(review): each token is lemmatised before it is lower-cased, so
    # capitalised words may reach the lemmatiser unchanged - confirm intended.
    return [
        lemmatizer.lemmatize(word).lower()
        for sentence in nltk.tokenize.sent_tokenize(string)
        for word in nltk.tokenize.word_tokenize(sentence)
    ]

In [153]:
def get_features(df, vocabulary, tokenizer=None):
    """Build a bag-of-words count matrix for the reviews in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column of raw text.
    vocabulary : sequence of str
        Words to count; column ``i`` of the result corresponds to
        ``vocabulary[i]``.
    tokenizer : callable, optional
        Function mapping a review string to a list of tokens. Defaults to
        ``get_tokens``.

    Returns
    -------
    numpy.ndarray
        Float array with one row per review and one column per vocabulary
        word, holding how often that word occurs in that review.
    """
    if tokenizer is None:
        tokenizer = get_tokens
    features_array = []
    for review in df['review']:
        # Count every token once per review, instead of rescanning the token
        # list for each vocabulary word.
        counts = Counter(tokenizer(review))
        # A Counter returns 0 for words that do not occur in the review.
        features_array.append(
            np.array([counts[word] for word in vocabulary], dtype=float)
        )
    return np.asarray(features_array)

#### Create set of stopwords that will be removed later

In [73]:
# Start from NLTK's English stopword list.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Extend it with punctuation, contraction fragments produced by the word
# tokeniser, and "br" (presumably residue of HTML line-break tags - verify).
stopwords.update([
    ".", ",", "--", "``", "#", "@", ":", "'s", "’", "...",
    "n't", "'re", "'", "-", ";", "/", ">", "<", "br",
    "(", ")", "''", "&",
])

# Define custom transformers

In [125]:
class getTokensVocab(BaseEstimator, TransformerMixin):
    """Pipeline step that flattens all reviews into one list of tokens.

    ``transform`` expects a frame with a ``review`` column and returns a
    single flat list of lemmatised, lower-cased tokens, in review order,
    with duplicates kept.
    """
    def __init__(self):
        # No hyper-parameters to store.
        pass
    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self
    def transform(self, X):
        """Return the tokens of every review in ``X['review']`` as one list."""
        list_tokens = []
        for review in X['review']:
            # Delegate to the shared get_tokens helper rather than repeating
            # its sentence-split / word-split / lemmatise / lower-case steps.
            list_tokens.extend(get_tokens(review))
        return list_tokens

In [128]:
class sortTokens(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps the ``n`` most frequent non-stopword tokens.

    Parameters
    ----------
    n : int
        How many of the most frequent tokens to keep.
    """
    def __init__(self, n):
        self.n = n
    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self
    def transform(self, X):
        """Count the tokens in ``X`` and return the top ``n`` by frequency.

        Tokens found in the module-level ``stopwords`` set are ignored.

        Returns
        -------
        list of (str, int)
            ``(token, count)`` pairs, highest count first. Tokens with equal
            counts keep the order in which they first appeared in ``X``.
        """
        dict_word_freq = Counter(token for token in X if token not in stopwords)
        # sorted() is stable, so ties stay in first-seen order.
        sorted_tokens = sorted(
            dict_word_freq.items(), key=lambda item: item[1], reverse=True
        )
        return sorted_tokens[:self.n]

In [131]:
class getVocabulary(BaseEstimator, TransformerMixin):
    """Pipeline step that drops the counts from ``(word, frequency)`` pairs."""
    def __init__(self):
        # No hyper-parameters to store.
        pass
    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self
    def transform(self, X):
        """Return the words of the ``(word, frequency)`` pairs as a NumPy array.

        The order of the pairs in ``X`` is preserved.
        """
        words = [word for word, _frequency in X]
        return np.asarray(words)

In [149]:
class getFeatures(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a vocabulary into a count matrix.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``review`` column; its reviews are the ones that get
        vectorised.
    """
    def __init__(self, train):
        self.train = train
    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self
    def transform(self, X):
        """Return the count features of the stored frame.

        Here ``X`` is the vocabulary passed on by the previous step, not the
        data to featurise: counts are computed for ``self.train``, one
        column per word in ``X``.
        """
        return get_features(self.train, X)

# Create transformation pipeline

In [179]:
# Steps, in order: flatten reviews to tokens, keep the 100 most frequent
# non-stopword tokens, reduce them to a vocabulary, count those words in each
# review of `train`, then min-max scale the counts.
vocab_feature_steps = [
    ('tokenize', getTokensVocab()),
    ('sort', sortTokens(100)),
    ('get_vocab', getVocabulary()),
    ('get_features', getFeatures(train)),
    ('minmax_scaler', MinMaxScaler()),
]
feature_1_vocab = Pipeline(vocab_feature_steps)

In [180]:
# Run every pipeline step on the training frame. The last step is a
# MinMaxScaler with default settings, so the printed matrix holds the
# per-word counts rescaled column by column into the range 0 to 1.
f1 = feature_1_vocab.fit_transform(train)
print(f1)

[[0.06451613 0.04       0.11111111 ... 0.16666667 0.         0.        ]
 [0.         0.12       0.         ... 0.         0.         0.        ]
 [0.         0.04       0.11111111 ... 0.         0.         0.1       ]
 ...
 [0.         0.         0.         ... 0.         0.         0.3       ]
 [0.19354839 0.04       0.         ... 0.         0.         0.2       ]
 [0.06451613 0.         0.         ... 0.         0.         0.        ]]
