# Data Pre-Processing
***

Since our approach is to explore prominent themes in the tweets, we have consciously decided to not purge duplicates.
The presence of duplicates (retweets, in this case) indicates that multiple people agree with or share the opinion.

In data processing we first performed tokenization. Tokenization is the process of breaking a stream of textual content into 
words, themes, symbols or some meaningful elements called tokens. Here we generated unigrams, bigrams and trigrams for each and 
every row or tweet in the data. Then we created three new columns in the dataset containing these unigrams, bigrams and trigrams of the tweets present in the respective rows. 
The idea behind this tokenization was to better understand the tweets. We also performed lemmatization on the tokens generated 
from the tweets and created a new column to store these lemmatized words.

Finally, we performed a naive sentiment analysis and added a new column in which we tagged each tweet as positive or negative.


In [3]:
import os
import re
import ast   
import nltk
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.spatial import distance

# Change these to work with different local files.
# The paths are relative to the current working directory; os.path.join picks
# the right separator so they resolve on POSIX systems as well as on Windows.
TEXT_FILE = os.path.join("data", "iPhoneXS_twitter.csv")
EDA_FILE = os.path.join("data", "iPhoneXS_twitter_eda.csv")

def get_unigrams(text, extra_stop_words=None):
    """Split *text* into unigram tokens, dropping stop words and punctuation.

    Args:
        text: The string to tokenize.
        extra_stop_words: Optional iterable of additional words to discard
            on top of the NLTK stop-word corpus.

    Returns:
        A list of tokens in their original order. Each token has leading and
        trailing punctuation stripped; stop words and empty tokens are removed.
    """
    # stopwords.words() is called without a language, exactly as before, so
    # the filter is not restricted to English. Holding the words in a set
    # makes each membership test O(1) instead of a scan of the whole list.
    stop = set(stopwords.words())
    if extra_stop_words is not None:
        stop.update(extra_stop_words)

    unigrams = []
    for token in nltk.word_tokenize(text):
        # strip surrounding punctuation first so "phone," matches "phone"
        token = token.strip(string.punctuation)
        if token in stop:
            continue
        # remove empty tokens (e.g. tokens that were pure punctuation)
        token = token.strip()
        if token:
            unigrams.append(token)

    return unigrams

#function used for wordnet tagging
def get_wordnet_pos(pos_tag):
    """Map a part-of-speech tag to the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.

    Args:
        pos_tag: The tag string, as found in the (word, tag) tuples
            produced by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_tag in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_tag

    # by default, treat the word as a noun
    return wordnet.NOUN

def lemmatize_words(text, extra_stop_words=None):
    """Lemmatize the words of *text*, skipping stop words and punctuation.

    Args:
        text: The string to process.
        extra_stop_words: Optional iterable of additional words to discard
            on top of NLTK's English stop words.

    Returns:
        A list with the WordNet lemma of every remaining token, in order.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))
    if extra_stop_words is not None:
        stop_words.update(extra_stop_words)

    # tagged_tokens is a list of (word, tag) tuples; the tag is translated to
    # a WordNet POS so the lemmatizer can pick the right base form.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    wordnet_lemmatizer = WordNetLemmatizer()

    return [
        wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_tokens
        # `word in string.punctuation` is a substring test on the
        # punctuation string, kept unchanged from the original filter
        if word not in stop_words and word not in string.punctuation
    ]

# Default location of the positive-word lexicon (one entry per line).
_POSITIVE_WORDS_PATH = r"A:\BIA 660 - Web Mining\project\product-buzz-analysis\scraper\positive-words.txt"
# Lexicons already read from disk, keyed by path. Posgrams is applied to every
# row of the DataFrame, so without this the file would be re-read per tweet.
_POSITIVE_WORDS_CACHE = {}


def _load_positive_words(path):
    """Return the set of stripped lines of *path*, reading the file only once."""
    if path not in _POSITIVE_WORDS_CACHE:
        with open(path, 'r') as f:
            _POSITIVE_WORDS_CACHE[path] = {line.strip() for line in f}
    return _POSITIVE_WORDS_CACHE[path]


def Posgrams(text, lexicon_path=_POSITIVE_WORDS_PATH):
    """Count the tokens of *text* that appear in the positive-word lexicon.

    Args:
        text: The string to score.
        lexicon_path: Path of the lexicon file, one word per line. Defaults
            to the location the function has always read from.

    Returns:
        The number of tokens (repeats included) found in the lexicon.

    Raises:
        OSError: If the lexicon file cannot be opened.
    """
    # get_unigrams performs the same tokenize / strip punctuation /
    # drop stop words / drop empties pipeline this function used to inline.
    tokens = get_unigrams(text)
    positive_words = _load_positive_words(lexicon_path)
    return sum(1 for token in tokens if token in positive_words)
    
# Default location of the negative-word lexicon (one entry per line).
_NEGATIVE_WORDS_PATH = r"A:\BIA 660 - Web Mining\project\product-buzz-analysis\scraper\negative-words.txt"
# Lexicons already read from disk, keyed by path. Neggrams is applied to every
# row of the DataFrame, so without this the file would be re-read per tweet.
_NEGATIVE_WORDS_CACHE = {}


def _load_negative_words(path):
    """Return the set of stripped lines of *path*, reading the file only once."""
    if path not in _NEGATIVE_WORDS_CACHE:
        with open(path, 'r') as f:
            _NEGATIVE_WORDS_CACHE[path] = {line.strip() for line in f}
    return _NEGATIVE_WORDS_CACHE[path]


def Neggrams(text, lexicon_path=_NEGATIVE_WORDS_PATH):
    """Count the tokens of *text* that appear in the negative-word lexicon.

    Args:
        text: The string to score.
        lexicon_path: Path of the lexicon file, one word per line. Defaults
            to the location the function has always read from.

    Returns:
        The number of tokens (repeats included) found in the lexicon.

    Raises:
        OSError: If the lexicon file cannot be opened.
    """
    # get_unigrams performs the same tokenize / strip punctuation /
    # drop stop words / drop empties pipeline this function used to inline.
    tokens = get_unigrams(text)
    negative_words = _load_negative_words(lexicon_path)
    return sum(1 for token in tokens if token in negative_words)

def comment_data_eda(comments_file_path):
    """Load the tweet CSV and add token, lemma and naive-sentiment columns.

    The function prints the head and the info summary of the raw frame, then
    cleans the ``text`` column (decode, drop the retweet marker, lower-case)
    and appends these columns: ``unigrams``, ``bigrams``, ``trigrams``,
    ``lemmatized_words``, ``Positive_count``, ``Negative_count`` and
    ``More_positive_than_Negative``.

    Args:
        comments_file_path: Path of the CSV file to read. It must provide at
            least the ``created_at`` and ``text`` columns.

    Returns:
        The enriched pandas DataFrame.
    """
    # reading the data into a pandas dataframe
    comments_df = pd.read_csv(comments_file_path)
    # having a first look at all the columns before going ahead
    print(comments_df.head())
    # converting the created_at column to pandas datetime
    comments_df['created_at'] = pd.to_datetime(comments_df['created_at'])
    # DataFrame.info() writes its summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    comments_df.info()

    # each tweet is stored as the literal of a bytes object (b'...'):
    # evaluate the literal, then decode the bytes to a utf-8 string
    comments_df['text'] = comments_df['text'].apply(ast.literal_eval).str.decode('utf-8')
    # remove 'RT' which denotes re-tweet; match it as a whole word only so
    # that words merely containing those letters (e.g. "SMART") stay intact
    retweet_marker = re.compile(r'\bRT\b')
    comments_df['text'] = comments_df['text'].apply(lambda x: retweet_marker.sub('', x))
    # convert all the text to lower case for further text mining
    comments_df['text'] = comments_df['text'].str.lower()

    # three new columns for the unigrams, bigrams and trigrams of each tweet;
    # bigrams and trigrams are built from the already filtered unigrams
    comments_df['unigrams'] = comments_df['text'].apply(get_unigrams)
    comments_df['bigrams'] = comments_df['unigrams'].apply(lambda x: list(nltk.bigrams(x)))
    comments_df['trigrams'] = comments_df['unigrams'].apply(lambda x: list(nltk.trigrams(x)))

    # lemmatize each tweet and keep the result in a new column
    comments_df['lemmatized_words'] = comments_df['text'].apply(lemmatize_words)

    # basic sentiment analysis by comparing positive and negative word counts
    comments_df['Positive_count'] = comments_df['text'].apply(Posgrams)
    comments_df['Negative_count'] = comments_df['text'].apply(Neggrams)
    # 'Neg' is the default, so a tie between the two counts is tagged 'Neg'
    comments_df['More_positive_than_Negative'] = 'Neg'
    more_positive = comments_df['Positive_count'] > comments_df['Negative_count']
    comments_df.loc[more_positive, 'More_positive_than_Negative'] = 'Pos'

    return comments_df

if __name__ == "__main__":
    # Resolve the input and output CSV locations to absolute paths.
    source_csv = os.path.abspath(TEXT_FILE)
    target_csv = os.path.abspath(EDA_FILE)

    # Run the pre-processing; this also prints a first look at the raw frame.
    enriched_df = comment_data_eda(comments_file_path=source_csv)

    print("Now we print the head of the modified data frame with all the new columns")
    print(enriched_df.head())

    # Persist the enriched frame without the index column.
    enriched_df.to_csv(target_csv, sep=',', encoding='utf-8', index=False)


                    id           created_at              user_id  \
0  1069693113013321728  2018-12-03 20:41:26            342431891   
1  1069692711077326848  2018-12-03 20:39:50             50577341   
2  1069691473015107584  2018-12-03 20:34:55  1002325987919474693   
3  1069689874192711680  2018-12-03 20:28:34            581308005   
4  1069689321505153030  2018-12-03 20:26:22            239173653   

       screen_name                                               text  
0          JKNACK3  b'So just saw my friend\xe2\x80\x99s #iPhoneXS...  
1   DudeFaceGeoffy  b'My first selfie on my new iphone xs #Selfie ...  
2  MobilizeStories  b'RT @dharavisual: Shot on iPhone XS Max! #app...  
3     hagiofficial  b'RT @pschiller: Play To Win. \n#Apple #iPhone...  
4    ToddPatrick51  b"@VerizonSupport I have buyer's remorse with ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10057 entries, 0 to 10056
Data columns (total 5 columns):
id             10057 non-null int64
created_at     