======= For International Students ======= 
Write a program to construct a dictionary (vocabulary) of the corpus EnglishDataset_Assignment1.txt, which contains 25,000 IMDB movie reviews. You have to do the following:
 - Preprocessing: tokenization, stopword removal, remove punctuation, and stemming (simple normalization if needed).
 - Sort terms by term frequency and draw a figure to prove that they follow Zipf's law (long-tail distribution). 
 - Rank terms by global TF-IDF.
 - Save the result as a txt file. 


# Preprocessing Data

## Import all needed packages
Please install all packages first

!!! Use `nltk.download()` to download all NLTK datasets to 'C:\nltk_data' if you don't have them yet

Please refer to this link: https://www.nltk.org/data.html

In [1]:
import csv
import math
import operator
import string

import nltk
import pandas as pd
import sklearn
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Read data

In [2]:
# Parse the CSV straight from the file handle instead of splitting the raw
# text on '\n' first: quoted fields that contain line breaks stay intact, and
# the last record is no longer dropped when the file lacks a trailing newline.
# newline='' is the mode the csv module asks for.
with open('task2_trainset.csv', encoding='utf-8', newline='') as f:
    # Blank lines are parsed as empty rows; skip them (this also covers a
    # trailing empty line at the end of the file).
    data = [
        row
        for row in csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        if row
    ]

# The first row holds the column names, the remaining rows are the records.
data = pd.DataFrame(data[1:], columns=data[0])  # Transform to Pandas DataFrame




## Tokenization by using NLTK 

In [3]:
def tokenize_word(sentence):
    """Split ``sentence`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

def lowercase(words):
    """Return the string form of ``words`` converted to lower case.

    The argument is passed through ``str()`` first, so non-string values are
    accepted as well.
    """
    text = str(words)
    return text.lower()

# Tokenize the lower-cased title and abstract of every record.
# Iterate over the column values directly instead of looking each one up by
# position through range(len(data)).
data['Title_tokenized'] = [tokenize_word(lowercase(title)) for title in data['Title']]
data['Abstract_tokenized'] = [tokenize_word(lowercase(abstract)) for abstract in data['Abstract']]



## Normalization
1. Remove punctuation and whitespace words
2. Replace number
3. Remove stop words
4. Stemming
5. Lemmatization

In [4]:
def remove_punctuation_whitespace(words):
    """Strip punctuation and whitespace characters from every token.

    Each token loses all characters found in ``string.punctuation`` or
    ``string.whitespace``; tokens that end up empty are discarded.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the cleaned, non-empty tokens in their original order.
    """
    unwanted = string.punctuation + string.whitespace
    cleaned = []
    for token in words:
        kept = ''.join(ch for ch in token if ch not in unwanted)
        if kept:
            cleaned.append(kept)
    return cleaned

def replace_number(words):
    """Convert digit-only tokens to their textual form (e.g. '1' to 'one').

    Tokens for which ``str.isdigit()`` is false are returned unchanged.

    NOTE(review): ``inflect`` is not imported in the visible import cell of
    this file - confirm it is imported before this function is called.
    """
    engine = inflect.engine()
    converted = []
    for token in words:
        if token.isdigit():
            converted.append(engine.number_to_words(token))
        else:
            converted.append(token)
    return converted

def remove_stopwords(words):
    """Drop every token that appears in NLTK's English stopword list.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the tokens that are not stopwords, in original order.
    """
    # Build a set once so each membership test is a hash lookup instead of a
    # scan over the whole stopword list.
    stop_words = set(stopwords.words('english'))
    return [w for w in words if w not in stop_words]

# Stemming is skipped because it produced malformed stems, e.g. 'comedy' -> 'comedi'

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``.

    NOTE(review): the original comment on this function said that Porter
    seems to be better than Lancaster, yet ``LancasterStemmer`` is what the
    code instantiates - confirm which stemmer is intended.
    """
    lancaster = LancasterStemmer()
    return list(map(lancaster.stem, words))

def lemmatize_verbs(words):
    """Lemmatize every token in ``words`` as a verb with WordNet.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list containing the verb lemma of each token.
    """
    lemmatizer = WordNetLemmatizer()
    # pos='v' is required here: without it lemmatize() treats every token as
    # a noun, so verb forms would be returned unchanged.
    return [lemmatizer.lemmatize(w, pos='v') for w in words]

def normalize(data):
    """Clean every tokenized document in ``data``.

    Each document is passed through ``remove_punctuation_whitespace`` and
    then ``remove_stopwords``.

    Args:
        data: Iterable of token lists, one list per document.

    Returns:
        A list with one cleaned token list per input document.
    """
    # Number replacement, stemming and lemmatization are intentionally left
    # out of the pipeline; stemming was dropped because it produced stems
    # such as 'comedy' -> 'comedi'.
    return [remove_stopwords(remove_punctuation_whitespace(words)) for words in data]

# When run as a script, replace both tokenized columns with their normalized
# versions (the result of normalize() is assigned back to the same column).
if __name__ == "__main__":
    data['Title_tokenized'] = normalize(data['Title_tokenized'])
    data['Abstract_tokenized'] = normalize(data['Abstract_tokenized'])


## Word Embedding

## TF-IDF Calculation

In [7]:
def tfidf_vectorize(data):
    """Build a document-by-term TF-IDF table with scikit-learn.

    Every token list in ``data`` is joined into one space-separated string
    and fed to a ``TfidfVectorizer`` (English stop words removed, terms with
    a document frequency above 0.9 ignored, smoothed IDF).

    Args:
        data: Iterable of token lists, one list per document.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        vocabulary term, holding the TF-IDF weights.
    """
    data = [list_to_string(d) for d in data]
    vectorizer = TfidfVectorizer(smooth_idf=True, analyzer='word', stop_words='english', max_df=0.9)
    vectors = vectorizer.fit_transform(data)
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); keep the old call as a fallback for
    # installations that predate the new method.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Hand the dense array to pandas directly rather than through an
    # intermediate nested Python list.
    data_vectors = pd.DataFrame(vectors.toarray(), columns=feature_names)
    return data_vectors

def list_to_string(words):
    """Join the tokens in ``words`` into one space-separated string.

    Args:
        words: Iterable of strings.

    Returns:
        The tokens separated by single spaces; an empty string when
        ``words`` is empty.
    """
    # str.join accepts any iterable, so no wrapping generator is needed.
    return ' '.join(words)

# When run as a script, compute the TF-IDF table for the tokenized abstracts.
if __name__ == "__main__":
    abstract_vectorize = tfidf_vectorize(data['Abstract_tokenized'])

# Data Representation
1. Get a dictionary of word frequencies. 
> Since the word 'would' appears in the top 10, the NLTK stopword list does not seem sufficient
2. Plot a line chart; it looks like it follows Zipf's law

In [None]:
def nltk_freqdist(words):
    """Count token occurrences over all documents.

    Args:
        words: Iterable of token lists, one list per document.

    Returns:
        An NLTK ``FreqDist`` built from the flattened token sequence.
    """
    flat_tokens = []
    for doc_tokens in words:
        flat_tokens.extend(doc_tokens)
    return FreqDist(flat_tokens)

def print_sorted_tf(tf, number):
    """Print the ``number`` terms with the highest frequency.

    Args:
        tf: Mapping of term to frequency.
        number: How many of the top-ranked (term, frequency) pairs to print.
    """
    by_count = operator.itemgetter(1)
    ranked = sorted(tf.items(), key=by_count, reverse=True)
    top_terms = ranked[:number]
    print('Top {} high frequency words: \n{}\n'.format(number, top_terms))
    
def print_plot_freqdist(tf, number):
    """Plot the highest term frequencies in descending order.

    Args:
        tf: Mapping of term to frequency.
        number: Maximum number of top frequencies to plot.

    NOTE(review): ``plt`` is not imported in the visible import cell of this
    file (presumably ``matplotlib.pyplot``) - confirm it is imported before
    this function is called.
    """
    y = sorted(tf.values(), reverse=True)[:number]
    # Size the x axis from y: when there are fewer than `number` distinct
    # terms, range(number) would be longer than y and the two sequences
    # handed to the plot call would not match.
    x = range(len(y))
    print('Frequency Distribution: \n', plt.plot(x, y))
    
def get_tfidf(tf, words):
    """Compute a global TF-IDF score for every term in ``tf``.

    The score of a term is ``tf[term] * log10(N / df[term])`` where ``N`` is
    the number of documents in ``words`` and ``df[term]`` is the number of
    documents that contain the term.

    Args:
        tf: Mapping of term to its total frequency in the corpus.
        words: Sized iterable of token lists, one list per document.

    Returns:
        A dict mapping each term of ``tf`` to its TF-IDF score.

    Raises:
        KeyError: If a document contains a token that is not a key of ``tf``.
        ZeroDivisionError: If a term of ``tf`` occurs in no document.
    """
    n_docs = len(words)

    # Document frequency: count each term once per document it occurs in.
    df = dict.fromkeys(tf, 0)
    for doc in words:
        for term in set(doc):
            df[term] += 1

    # Keep df and the final scores in separate dicts so that no entry changes
    # meaning halfway through the computation.
    return {term: math.log10(n_docs / df[term]) * tf[term] for term in tf}

def print_sorted_tfidf(tfidf, number):
    """Rank terms by TF-IDF score, print the top ``number`` and return all.

    Args:
        tfidf: Mapping of term to TF-IDF score.
        number: How many of the top-ranked (term, score) pairs to print.

    Returns:
        The complete list of (term, score) pairs sorted by descending score.
    """
    by_score = operator.itemgetter(1)
    ranked = sorted(tfidf.items(), key=by_score, reverse=True)
    top_terms = ranked[:number]
    print('Top {} high TF-IDF words: \n{}\n'.format(number, top_terms))
    return ranked
        
if __name__ == "__main__":
    # NOTE(review): `words` was referenced here without ever being assigned at
    # module level. The normalized abstract tokens are assumed to be the
    # intended corpus (the same column the TF-IDF cell uses) - confirm.
    words = data['Abstract_tokenized']
    tf = nltk_freqdist(words) #I found this shorter way with nltk 
    print_sorted_tf(tf, 100)
    tfidf = get_tfidf(tf, words)
    sorted_tfidf = print_sorted_tfidf(tfidf, 100)
    print_plot_freqdist(tf, 100)

# Save file

In [8]:
# Write the DataFrame, including the tokenized columns, to a CSV file with a header row.
# NOTE(review): index=None presumably suppresses the index column; pandas
# documents this parameter as a bool, so index=False would be clearer - confirm.
data.to_csv('trainingdata_tokenized.csv', header=True, index=None)

In [None]:
# Bare expression: evaluates `data` (in a notebook cell this displays the DataFrame).
data