In [None]:
pip install nltk

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import word_tokenize, pos_tag, pos_tag_sents

### Import modules needed

In [None]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# NLTK data packages this notebook relies on. Both the legacy and the current
# package names are requested so the notebook runs on old and recent NLTK
# releases: recent releases load 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the pickled 'punkt' /
# 'averaged_perceptron_tagger' models.
NLTK_RESOURCES = [
    'vader_lexicon',                   # lexicon used by SentimentIntensityAnalyzer
    'stopwords',                       # stopwords.words('english')
    'averaged_perceptron_tagger',      # pos_tag / pos_tag_sents (older NLTK)
    'averaged_perceptron_tagger_eng',  # pos_tag / pos_tag_sents (recent NLTK)
    'punkt',                           # word_tokenize (older NLTK)
    'punkt_tab',                       # word_tokenize (recent NLTK)
    'wordnet',                         # WordNetLemmatizer
    'omw-1.4',                         # NOTE(review): required next to 'wordnet' by some NLTK releases - confirm for your version
]
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Shared analyzer / lemmatizer instances used by the cells below.
# They are created after the downloads because they load the downloaded data.
sid = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()

### Customized POS Tagger

In [None]:
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.

    Parameters
    ----------
    nltk_tag : str
        A tag such as those produced by ``pos_tag`` (e.g. 'JJ', 'VBD', 'NN').

    Returns
    -------
    The corresponding ``wordnet`` constant, or ``None`` when the tag starts
    with none of the four letters above.
    """
    # (tag prefix, attribute name on the wordnet module), checked in order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [None]:
# Load the raw survey answers.
df = pd.read_csv('opinion_survey.csv')
# Some values are missing; use the literal string "Neutral" for them.
df = df.fillna('Neutral')
# Drop the first column (extra index).
df = df.drop(df.columns[[0]], axis=1)
# Work on an independent copy so `df` stays untouched for testing. A plain
# `df.iloc[0:, 0:]` slice is not guaranteed to be independent of `df`, and
# assigning columns on it triggers SettingWithCopyWarning.
test_df = df.copy()
test_df

In [None]:
# NLTK's English stop words plus the extra word 'though'.
stoplist = stopwords.words('english') + ['though']


def remove_stopWords(w):
    """Return ``w`` without the tokens that appear in ``stoplist``.

    ``w`` is split on whitespace. Tokens are compared to ``stoplist`` exactly
    as written (case-sensitive), and the surviving tokens are joined back
    together with single spaces.
    """
    kept_tokens = []
    for token in w.split():
        if token in stoplist:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def _clean_text(value):
    """Lower-case one cell, strip punctuation and remove stop words."""
    lowered = str(value).lower()
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    without_punctuation = " ".join(re.findall(r'[\w]+', lowered))
    return remove_stopWords(without_punctuation)


def _lemmatize_tagged(tagged_tokens):
    """Lemmatize one row of (word, tag) pairs and join the result with spaces."""
    lemmas = []
    for word, tag in tagged_tokens:
        wn_pos = pos_tagger(tag)  # Convert the POS tag to a WordNet POS (or None)
        if wn_pos is None:
            # No usable WordNet POS: keep the token as is.
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return " ".join(lemmas)


# Number of columns, taken before the loop so the 'POS' column added below is
# never processed itself.
col_range = len(test_df.columns)

for col_idx in range(col_range):
    col = test_df.columns[col_idx]  # The current column

    # Lower case, remove punctuation, remove stop words.
    test_df[col] = test_df[col].apply(_clean_text)

    # POS tagging: one list of (word, tag) pairs per row.
    # `tagged_texts` is left at module level because a later cell prints it.
    texts = test_df[col].tolist()
    tagged_texts = pos_tag_sents(map(word_tokenize, texts))

    # Lemmatization, one string per row.
    lemmatized_rows = [_lemmatize_tagged(row) for row in tagged_texts]

    # Write the lemmatized text back into the column. Previously the result
    # only went into 'POS', which every iteration overwrote, so the
    # lemmatization of each column was thrown away.
    test_df[col] = lemmatized_rows

    # Keep the trailing 'POS' column (lemmatized text of the most recently
    # processed column): the cell that reloads the saved CSV drops the last column.
    test_df['POS'] = lemmatized_rows

# If a cell ended up empty / whitespace-only, replace it with the string "neutral".
test_df = test_df.replace(r'^\s*$', "neutral", regex=True)
test_df

In [None]:
# Persist the preprocessed frame (without the index) so later cells can reload it.
test_df.to_csv('lemmatized_opinion.csv', index=False)

### Sample POS tags (first 20 rows of the last processed column)

In [None]:
from pprint import pprint 

# tagged_texts is reassigned on every pass of the preprocessing loop, so this
# prints the (word, tag) lists for the first 20 rows of the last column processed.
pprint(tagged_texts[:20])

In [None]:
# Reload the CSV written by the preprocessing step.
lem_df = pd.read_csv('lemmatized_opinion.csv')
lem_df = lem_df.iloc[0:, 0:-1] # Keep all rows; drop only the last column
lem_df

In [None]:
# Mean VADER compound score of every opinion column, in column order.
comp = []
# Number of opinion columns: lem_df is the saved frame without its last column.
col_range = len(lem_df.columns)

for col in test_df.columns[:col_range]:
    # Compound polarity score of every row in the current column.
    compound = test_df[col].apply(lambda text: sid.polarity_scores(text)['compound'])
    # Average over the actual number of rows (the divisor was hard-coded to 140).
    # float() keeps plain Python floats in `comp`, as before.
    comp.append(float(compound.mean()))
    # No temporary 'scores' column is added any more, so the
    # `drop('scores', 1)` call is gone; its positional `axis` argument is
    # rejected by pandas >= 2.0.

In [None]:
from pprint import pprint

# One mean compound score per processed column, in column order.
pprint(comp)

## Comparisons

In [None]:
# Previously computed sentiment scores, used as the baseline for the comparison.
df_num = pd.read_csv('opinion_sentiment_score.csv')
# The column slice 0:7:6 keeps positions 0 and 6 only (all rows).
# NOTE(review): position 6 is presumably the 'Sentiment Score' column renamed
# in a later cell - confirm against the CSV.
cor_num = df_num.iloc[0:, 0:7:6]
cor_num


In [None]:
# N-gram based sentiment scores computed elsewhere.
df_ngram = pd.read_csv('ngram_sentiment_score.csv')
# Flatten to a 1-D array (rather than a nested list from .tolist()) so it can
# be assigned directly as a single column of the comparison table.
ngram_list = df_ngram.values.flatten()

In [None]:

# Assemble the comparison table: the baseline score column is renamed, then the
# lemmatized and the lemmatized + n-gram scores are appended as new columns.
extra_columns = {
    "With Lemmanization": comp,
    "Lemmatized and Ngram": ngram_list,
}
cor_num = (
    cor_num
    .rename(columns={'Sentiment Score': 'Without Lemmanization'})
    .assign(**extra_columns)
)

cor_num.style.set_caption('Comparison of Sentiment Score results')

In [None]:
# Write the comparison table to disk without the index column.
cor_num.to_csv("Sentiment_Comparison.csv",index=False)