<a href="https://colab.research.google.com/github/cychen116/ExploratoryDataAnalysis_practice/blob/main/05_EDA_practice_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
import random 
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
import nltk
import string
from nltk.stem import *
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.probability import FreqDist

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Data Collection

- Text Souce - Obama's Facebook Post
- web scraping

# Data Loading

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [9]:
data_dir = "/content/drive/My Drive/Colab Notebooks/EMSE 6574_c/Data"
!ls '{data_dir}'

 cities_distance.csv			   ML_Iris
 cities_distance.xlsx			   News_Category_Dataset_v2.json
'Copy of News_Category_Dataset_v2.json'    obamaFB_sample.csv
 datasets_1474_2639_IMDB-Movie-Data.csv    pharma_sales.csv
 datasets_1474_2639_IMDB-Movie-Data.xlsx   pharma_sales.xlsx
 diamonds.csv				   StockPrice
'FEC dataset'				   tweets.json


In [11]:
df = pd.read_csv(f'{data_dir}/obamaFB_sample.csv')
df = df.drop(columns = 'Unnamed: 0')
df.head()

Unnamed: 0,text
0,Part of my goal in writing A Promised Land was...
1,"My memoir, A Promised Land, is out today—and I..."
2,Music has always played an important role thro...
3,"On May 1, 2011, a team of Navy SEALs embarked ..."
4,"""The fact that my own father was largely absen..."


# Data Preprocessing

In [18]:
# Chaining some preprocessing steps together, mainly:
# 1) Lowercasing text field
# 2) Removing punctuation
# 3) Replacing newline characters with spaces
# 4) Removing numbers
df['text_cleaned'] = df['text'] \
    .str.lower() \
    .str.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) \
    .str.replace('\n', ' ') \
    .str.replace('\d+', '')
    
# Remove stopwords from the cleaned up text field
stop_words = stopwords.words('english')
df['text_cleaned'] = df['text_cleaned'].apply(
    lambda row: ' '.join([word for word in row.split() if word not in stop_words])
)


  


In [20]:
# Lemmatize words in the text field. This lemmatizing
# step isn't perfect because I am not determining the
# POS (part-of-speech) tag so by default, the lemmatizer
# assumes each word is a noun and tries to find the lemma
# for that form of the word.
lemmatizer = WordNetLemmatizer()
df['text_cleaned'] = df['text_cleaned'].apply(
    lambda row: ' '.join([lemmatizer.lemmatize(word) for word in row.split()])
)

In [21]:
print(f'=====TEXT BEFORE PROCESSING===== \n"{df["text"][0]}"')
print(f'=====TEXT AFTER PROCESSING===== \n"{df["text_cleaned"][0]}"')

=====TEXT BEFORE PROCESSING===== 
"Part of my goal in writing A Promised Land was to share my experience as President in promoting and advancing our shared international goals like increased freedom, human rights, and a mutual understanding of one another. 
        For a long time, in many parts of the world, we’ve been on a trajectory of progress in these areas. 
        But we’ve got to realize that democracy, better educational attainment, and rising prospects, especially for women—these things are not predestined. 
        So I also wanted the book to be a call to all those who say that these are values and ideals worth fighting for; a reminder of what it takes to protect and further the gains we’ve made around the world. 
        #APromisedLand"
=====TEXT AFTER PROCESSING===== 
"part goal writing promised land share experience president promoting advancing shared international goal like increased freedom human right mutual understanding one another long time many part world we’ve 

# Bag-of-Words

In [22]:
vectorizer = CountVectorizer()
text_bow = vectorizer.fit_transform(df['text_cleaned'])
text_bow_dense = pd.DataFrame(text_bow.todense(), columns = vectorizer.get_feature_names())
text_bow_dense.sample(5)



Unnamed: 0,absent,access,across,act,action,active,actually,administration,admiral,advancing,advisor,affected,afford,affordable,afternoon,ahead,air,aligns,alliance,along,already,also,always,america,american,analyzing,and,andrés,ann,announced,another,answer,anticipating,anyone,apromisedland,area,argument,around,artist,as,...,wear,week,weighing,well,west,whether,white,who,whose,widely,willing,willingness,win,window,winfrey,within,without,wobbly,woman,women,wonder,work,worked,worker,working,workshop,world,worry,worst,worth,would,writing,wrote,year,yet,you,young,youtu,yt,zone
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,0,2,0,0,0,0,0,0,1,0,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,2,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,2,3,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,1,0,1,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [23]:
vectorizer.get_feature_names()
term_counts = pd.DataFrame({'term' : vectorizer.get_feature_names(),
                            'counts': text_bow.toarray().sum(axis=0)})
term_counts.sort_values('counts', ascending = False).head()



Unnamed: 0,term,counts
525,president,14
770,we,14
422,make,13
797,world,12
743,ve,11


In [28]:
term_counts = term_counts.sort_values('counts', ascending = False)
term_counts[term_counts['counts'] >= 10]

Unnamed: 0,term,counts
525,president,14
770,we,14
422,make,13
797,world,12
743,ve,11
755,vote,11
287,got,11
500,people,10
24,american,10
34,apromisedland,10


# Sentiment Analyzer

## Label - positive/ negative/ neutral

In [39]:
positive_terms = [
    'advantage',
    'awesome',
    'amazing',
    'amaze',
    'best',
    'bountiful',
    'beautiful',
    'beauty',
    'breathtaking',
    'cool',
    'calm',
    'confident',
    'dashing',
    'delicious',
    'decadent',
    'diverse',
    'dope',
    'excellent',
    'enjoy'
    'fantastic',
    'friendly',
    'fun',
    'good',
    'great',
    'glad',
    'happy'
    'impress',
    'impressed',
    'joy',
    'like',
    'love',
    'loved',
    'nice',
    'open',
    'pretty',
    'positive',
    'positively',
    'professional',
    'wonderful'
    ]

negative_terms =[
    'awful',
    'atrocious',
    'angry',
    'beware',
    'bad',
    'broke',
    'broken',
    'boo',
    'bore',
    'boring',
    'crazy',
    'craze',
    'cantankerous',
    'cranky',
    'complaint'
    'doom',
    'darn',
    'death',
    'drat',
    'dislike',
    'disappointed',
    'disadvantage',
    'ew',
    'fail',
    'fault',
    'faulty',
    'hate',
    'horrible', 
    'icky',
    'idiot',
    'ill'
    'incompetent',
    'late',
    'mad',
    'mean',
    'mistake',
    'negative',
    'pain',
    'rot',
    'rotten',
    'rude',
    'sad'
    'suck',
    'stink'
    'slow',
    'stuck',
    'struggle',
    'terrible',
    'tired',
    'unprofessional',
    'unhappy'
    'wait',
    'waste',
    'worst',
    ]

# Analyze Function


In [40]:
def simple_sentiment_analyzer(positive_words, negative_words, data):
    '''Analyze a bag-of-words representation of texts for sentiment.

    This function simply uses counts of positive and negative words
    to determine if a particular text field is positive, negative, 
    of neutral.

    Parameters
    ----------
    positive_words : list[str]
        List of words that are considered positive.

    negative_words : list[str]
        List of words that are considered negative.

    data : pandas DataFrame
        A DataFrame containing the bag-of-words representation
        of a corpus.
    '''
    # Sum up all the postive word counts if they exist
    positive_counts = data[data.columns.intersection(positive_words)].sum(axis = 1)

    # Sum up all the negative word counts if they exist
    negative_counts = data[data.columns.intersection(negative_words)].sum(axis = 1)

    # Determine the final label based on the total score.
    total_score = positive_counts - negative_counts
    final_label = np.where(
        total_score > 0, 
        'POSITIVE', 
        np.where(total_score == 0, 'NEUTRAL', 'NEGATIVE') 
    )

    return final_label

In [41]:
positive_words = positive_terms
negative_words = negative_terms

text_bow_dense['sentiment'] = simple_sentiment_analyzer(
    positive_words,
    negative_words,
    text_bow_dense
)
text_bow_dense['sentiment'].value_counts()

POSITIVE    13
NEUTRAL     11
NEGATIVE     2
Name: sentiment, dtype: int64

# Analysis

In [42]:
random.seed(123)

data_analyzed = df.join(text_bow_dense[['sentiment']])
positive_sample = data_analyzed[data_analyzed['sentiment'] == 'POSITIVE'].sample(1)
negative_sample = data_analyzed[data_analyzed['sentiment'] == 'NEGATIVE'].sample(1)
neutral_sample = data_analyzed[data_analyzed['sentiment'] == 'NEUTRAL'].sample(1)

print(f'=====POSITIVE SAMPLE===== \n"{positive_sample["text"].values[0]}"\n=========================\n')
print(f'=====NEGATIVE SAMPLE===== \n"{negative_sample["text"].values[0]}"\n=========================\n')
print(f'=====NEUTRAL  SAMPLE===== \n"{neutral_sample["text"].values[0]}"\n=========================\n')

=====POSITIVE SAMPLE===== 
"For eight years in the White House, I walked along the West Colonnade to get to and from the Oval Office—a one-minute, open-air commute. It was along this walkway that I’d gather my thoughts for the day, preparing for conversations with members of Congress and constituents, reviewing plans and proposals to move the country forward. On the way back to the residence in the evenings, my briefcase stuffed with papers, I’d use the time to clear my mind, anticipating my dinner with Michelle and the girls, and an exuberant greeting from the dogs. This walk framed the beginning and end of my days, and it’s also the start of my book. Over the next couple of days, I’ll be sharing a glimpse of some of the most impactful moments of my life and the early years of my presidency. I hope you’ll travel the path with me. #APromisedLand"

=====NEGATIVE SAMPLE===== 
"On May 1, 2011, a team of Navy SEALs embarked on a top-secret operation to raid a compound in Pakistan where we 