Author: Felix Di Nezza IT DBA

DATA SOURCES <br>

Reddit Climate Change <br>
Author: Lexyr <br>
Source: https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset <br>
Data collected using [SocialGrep Exports](https://socialgrep.com/exports) <br>

Twitter data <br>
Author: DEEPSENSE <br>
Source: provided during the challenge <br>

LICENCE <br>
Attribution 4.0 International (CC BY 4.0) <br>
https://creativecommons.org/licenses/by/4.0/ <br>

CHANGES AND USAGE <br>
Dataset used for sentiment analysis

REFERENCES

Sentiment analysis on twitter <br>
https://medium.com/mlearning-ai/elon-musks-twitter-sentiment-analysis-with-transformers-hugging-face-roberta-49b9e61b1433 <br>

NLTK & RoBERTa tutorial <br>
https://www.youtube.com/watch?v=QpzMWQvxXWk <br>

Python regex functions <br>
https://pynative.com/python-regex-compile/ <br>

Word cloud tutorial <br>
https://medium.com/mcd-unison/create-word-cloud-scraping-data-from-reddit-api-using-praw-and-spacy-b5c9c61c2d10 <br>


In [None]:
## additional installs
#!pip install emoji
#!pip install emot

In [None]:
import collections
import csv
import numpy as np
import seaborn as sns
import pandas as pd
import re
import emoji
import pathlib
import tqdm
import string

# plot
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# nltk 
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# progress bar
from tqdm.notebook import tqdm

#emoji filter
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

In [None]:
# Load the Reddit climate-change comments CSV into the dataframe `df`.
# The file is opened explicitly as UTF-8 with newline='\n' (only '\n' ends a
# line) and the open handle is handed to pandas, which does all the parsing;
# no separate csv.reader is needed on the handle.
with open(
    './the-reddit-climate-change-dataset-comments.csv',
    'r',
    encoding="utf8",
    newline='\n',
) as source_csv:
    df = pd.read_csv(source_csv)

In [None]:
# print size of the dataframe as (rows, columns)
print(df.shape)

# return the 'body' text of the first row
df['body'].values[0]


In [None]:
# check dataframe information (pandas summary of the columns)
df.info()

In [None]:
# get the first 5 rows (head() default) to check the content structure
df.head()

In [None]:
# Multi-keyword search: count the comments that mention microplastics
# together with a body of water.
# dataframe[columns to visualize][boolean row filter]
#
# Each keyword group is one case-sensitive regex, so the column is scanned
# once per group:
#   [Mm]icro-?plastic          -> microplastic, Microplastic,
#                                 micro-plastic, Micro-plastic
#   [Oo]cean|[Ll]ake|[Ww]ater  -> ocean/Ocean, lake/Lake, water/Water
# na=False treats a missing body as "no match".
mentions_microplastic = df['body'].str.contains(r'[Mm]icro-?plastic', na=False)
mentions_water = df['body'].str.contains(r'[Oo]cean|[Ll]ake|[Ww]ater', na=False)

df[['body', 'subreddit.name']][mentions_microplastic & mentions_water].count()

In [None]:
# test retrieved data: print subreddit name and body of the row at position 11930
print(df[['subreddit.name','body']].values[11930])

In [None]:
# Extract the subset of comments that mention microplastics together with a
# body of water, keeping only the id, subreddit name and body columns.
#
# Each keyword group is one case-sensitive regex, so the column is scanned
# once per group:
#   [Mm]icro-?plastic          -> microplastic, Microplastic,
#                                 micro-plastic, Micro-plastic
#   [Oo]cean|[Ll]ake|[Ww]ater  -> ocean/Ocean, lake/Lake, water/Water
# na=False treats a missing body as "no match".
mentions_microplastic = df['body'].str.contains(r'[Mm]icro-?plastic', na=False)
mentions_water = df['body'].str.contains(r'[Oo]cean|[Ll]ake|[Ww]ater', na=False)

# .loc selects rows and columns in one step; .copy() makes sub_kw an
# independent frame, so later changes to it are unambiguous and never
# reach df.
sub_kw = df.loc[
    mentions_microplastic & mentions_water,
    ['id', 'subreddit.name', 'body'],
].copy()


In [None]:
# check content of the sub-search: subreddit name and body of the row at position 129
sub_kw[['subreddit.name', 'body']].values[129]

In [None]:

# Data cleansing functions
# must optimize

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# remove links
def remove_links(text):
    '''Takes a string and removes web links from it.

    Removes everything from "http" up to the next whitespace, bit.ly short
    links, and every literal "[link]" placeholder. Returns the resulting
    string; surrounding whitespace is left untouched.
    '''
    text = re.sub(r'http\S+', '', text)  # remove http(s) links
    # the dot is escaped so only a real "bit.ly/" prefix matches
    text = re.sub(r'bit\.ly/\S+', '', text)  # remove bitly links
    # str.strip('[link]') would treat its argument as a set of characters and
    # trim any of "[", "l", "i", "n", "k", "]" from both ends of the text
    # (e.g. "sink" -> "s"); replace() removes the literal placeholder instead.
    text = text.replace('[link]', '')
    return text

#remove HTML
def clean_html(text):
    '''Takes a string and removes HTML tags from it.

    Removes entity-escaped tags such as "&lt;br&gt;" or "&lt;/p&gt;" as well
    as literal tags such as "<br>", and returns the remaining text.
    '''
    # escaped tags first; the result is fed into the next substitution so
    # that both clean-ups actually take effect
    text = re.sub(r'&lt;/?[a-z]+&gt;', '', text)
    # literal tags: shortest match between "<" and ">"
    text = re.sub(r'<.*?>', '', text)
    return text


# remove special characters (underscores)
def rem_spec_c(text):
    '''Takes a string and returns it with every run of underscores deleted.'''
    underscore_runs = re.compile('([_]+)')
    return underscore_runs.sub("", text)

# remove punctuation
def clean_symb(text):
    '''Takes a string and replaces every character that is neither a word
    character (letter, digit, underscore) nor whitespace with a space.'''
    symbol = re.compile(r'[^\w\s]')
    return symbol.sub(' ', text)

# grab hashtags
def hashtags(text):
    '''Takes a string and returns the list of hashtag names found in it,
    without the leading "#".'''
    return [match.group(1) for match in re.finditer(r"#(\w+)", text)]

# remove hashtags
def rem_hashtags(text):
    '''Takes a string and deletes every "#hashtag" from it.'''
    hashtag = re.compile(r"#(\w+)")
    return hashtag.sub('', text)

# remove reddit usernames
def remove_users(text):
    '''Takes a string and removes reddit mentions of the form u/user_name.

    A mention is "u/" followed by at least two characters, the first of
    which must be a letter.
    '''
    reddit_user = re.compile('(u/[A-Za-z]+[A-Za-z0-9-_]+)')
    return reddit_user.sub('', text)

# remove twitter user
def rem_usr_twt(text):
    '''Takes a string and removes twitter mentions of the form @user_name.

    A mention is "@" followed by at least two characters, the first of
    which must be a letter.
    '''
    twitter_user = re.compile('(@[A-Za-z]+[A-Za-z0-9-_]+)')
    return twitter_user.sub('', text)

# translate emoji
def emoji_conv(text):
    '''Takes a string and replaces each emoji with its textual name.

    Every key of UNICODE_EMOJI found in the text is replaced by its
    description with commas and colons removed and the remaining words
    joined by "_". None is returned unchanged.
    '''
    if text is None:  # nothing to translate; checked once, not per emoji
        return text
    for emot in UNICODE_EMOJI:
        if emot not in text:
            continue  # skip building the name for emoji that do not occur
        name = "_".join(UNICODE_EMOJI[emot].replace(",", "").replace(":", "").split())
        text = text.replace(emot, name)
    return text

# remove non ascii character
def non_ascii(s):
    '''Takes a string and returns it with all non-ASCII characters
    (code point 128 and above) dropped.'''
    kept = [char for char in s if ord(char) < 128]
    return "".join(kept)

# turn all in low char
def lower(text):
    '''Takes a string and returns its lowercase version.'''
    lowered = text.lower()
    return lowered


# remove emoji
def emoji_remove(text):
    '''Takes a string and replaces each emoji listed in UNICODE_EMOJI with a
    single space. None is returned unchanged.'''
    if text is None:  # nothing to clean; checked once, not per emoji
        return text
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, ' ')
    return text



In [None]:
# test 1
# # run polarity score on the entire dataset
# res = {} # this is a dictionary
# for i, row in tqdm(df.iterrows(), total=len(df)):
#     text = row['body']
#     myid = row['id']
#     res[myid] = sia.polarity_scores(text)
#     break
    

In [None]:
# check the first 5 rows (head() default) of the keyword subset
sub_kw.head()

In [None]:
# prepare the NLTK sentiment analyzer (SentimentIntensityAnalyzer from nltk.sentiment)
sia = SentimentIntensityAnalyzer()

In [None]:
# Cleanse the comment bodies of the keyword subset.
def _cleanse_body(text):
    '''Lowercase a comment and strip links, HTML tags and emoji from it.'''
    text = lower(text)
    text = remove_links(text)
    text = clean_html(text)
    text = emoji_remove(text)
    return text

# The rows yielded by iterrows() may be copies (it depends on the column
# dtypes and the pandas version), so assigning to row['body'] is not
# guaranteed to change sub_kw. Build the cleaned column and store it on the
# frame explicitly instead; tqdm still shows the progress.
sub_kw = sub_kw.assign(
    body=[_cleanse_body(body) for body in tqdm(sub_kw['body'], total=len(sub_kw))]
)

In [None]:
# return first row to test
# print(sub_kw.values[1])

In [None]:
# Score every comment of the keyword subset with the sentiment analyzer,
# keyed by the comment id.
res = {}  # comment id -> polarity scores returned by sia
for _, record in tqdm(sub_kw.iterrows(), total=len(sub_kw)):
    res[record['id']] = sia.polarity_scores(record['body'])

In [None]:
# Store the result dictionary in a pandas dataframe.
# pd.DataFrame(res) has one column per comment id, so .T flips it to one row
# per id.
scores_by_id = pd.DataFrame(res).T
# move the ids out of the index into a regular column named 'id'
vaders = scores_by_id.reset_index()
vaders = vaders.rename(columns={'index': 'id'})
vaders.head()



In [None]:
# Attach the columns of the keyword subset to the newly calculated scores.
# Left merge: every row of vaders is kept; pandas joins on the column
# name(s) the two frames share.
vaders = pd.merge(vaders, sub_kw, how='left')

In [None]:
# print the row at position 1 (i.e. the second row) of the merged frame to test
print(vaders.values[1])

In [None]:
# show the first 3 rows of the merged scores
vaders.head(3)

In [None]:
# mean of the 'compound' score over all scored comments, rounded to 2 decimals
round(vaders['compound'].mean(),2)

In [None]:
# Label every scored comment as negative / positive / neutral from the sign
# of its compound score, to allow a group by on the label.
conv = {}  # row index -> (comment id, label)
for idx, scored in tqdm(vaders.iterrows(), total=len(vaders)):
    compound = scored['compound']
    # the three tests are mutually exclusive
    if compound < 0:
        exp = 'negative'
    elif compound > 0:
        exp = 'positive'
    elif compound == 0:
        exp = 'neutral'
    conv[idx] = scored['id'], exp

In [None]:
# debug output
# conv

In [None]:
# Turn the label dictionary into a dataframe.
# pd.DataFrame(conv) holds one column per entry, so transpose it (.T) to get
# one row per entry: first value the comment id, second the sentiment label.
results = pd.DataFrame(conv).T

# name columns
results.columns = ['id', 'type']




In [None]:
# test the transposed output with the renamed columns ('id', 'type')
results.head()

In [None]:
# Attach the vaders scores to the labelled results.
# Left merge: every row of results is kept; pandas joins on the column
# name(s) the two frames share.
results = pd.merge(results, vaders, how='left')

In [None]:
# check the first rows of the merged dataframe (labels plus scores)
results.head()

In [None]:
from matplotlib.colors import ListedColormap

# Pie chart of the number of comments per sentiment label.
# sort_index() orders the slices by label name, and the colours are drawn
# from the three-colour map below (red, yellow, blue).
cmap = ListedColormap(['#e50000', '#ffff14', '#0343df'])  # pie
#cmap = ListedColormap(['#0343df']) #bar
label_counts = results['type'].value_counts().sort_index()
label_counts.plot(
    kind='pie',
    title='Sentiment #Microplastic',
    ylabel='',
    colormap=cmap,
)

REDDIT WORD CLOUD

In [None]:
# !pip install wordcloud
# !pip install spacy
# !pip install PIL
!python -m spacy download en_core_web_sm

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import spacy


In [None]:
#replace keyword with space
def rem_kw (text, kw):
    '''Takes a string and returns it with every occurrence of kw replaced
    by a single space.'''
    return text.replace(kw, ' ')

# quick check: both '&gt' markers around "hello" are replaced by a space
test = ' &gthello&gt '
print(rem_kw(test, '&gt'))

In [None]:
# Clean and parse the Reddit corpus.
nlp = spacy.load("en_core_web_sm")
# Raise spaCy's maximum text length so the joined corpus can be parsed in a
# single call. Original note: "1gb ram every 1000000" — TODO confirm the
# memory-per-character figure against spaCy's documentation.
nlp.max_length = 2409706
# one big text: all comment bodies of the keyword subset, one per line
words = '\n'.join(sub_kw.body)
# cleansing steps, applied in this order
for cleanse in (
    lower,
    lambda corpus: rem_kw(corpus, '&gt'),
    remove_links,
    clean_html,
    remove_users,
    emoji_remove,
    clean_symb,
    rem_spec_c,
):
    words = cleanse(words)
text = nlp(words)

In [None]:
# Keep only adjectives, nouns and proper nouns for the word cloud.
# The pieces are joined in one pass: rebuilding the string inside a loop
# copies the whole accumulated text for every token. Each word carries its
# own leading space, so the text starts with a space and the words are
# separated by single spaces.
cloud = "".join(
    " " + word.text.lower()
    for word in text
    if word.pos_ in ('ADJ', 'NOUN', 'PROPN')
)

In [None]:
# Render the Reddit word cloud: at most 100 words, default stop words
# removed, on a white 800x300 canvas.
cloud_options = dict(
    stopwords=STOPWORDS,
    max_words=100,
    background_color='white',
    width=800,
    height=300,
)
wordcloud = WordCloud(**cloud_options).generate(cloud)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes around the image
plt.show()

In [None]:
#print (cloud)

TWITTER WORD CLOUD



In [None]:
# Open the Twitter CSV and turn it into the dataframe `tdf`.
# The file is opened explicitly as UTF-8 with newline='\n' (only '\n' ends a
# line) and the open handle is handed to pandas, which does all the parsing;
# no separate csv.reader is needed on the handle.
with open(
    './plastic_pollution.csv',
    'r',
    encoding="utf8",
    newline='\n',
) as source_csv:
    tdf = pd.read_csv(source_csv)

In [None]:
# Clean and parse the Twitter corpus.
nlp = spacy.load("en_core_web_sm")
# One big text: every value of the 'description' column, one per line.
# dropna(): pandas reads a missing description as a float NaN, which
# str.join() rejects with a TypeError.
words = '\n'.join(tdf.description.dropna())
# cleansing steps, applied in this order
words = lower(words)
words = remove_links(words)
words = clean_html(words)
words = rem_kw(words, '&gt')
words = rem_hashtags(words)
words = rem_usr_twt(words)
words = emoji_remove(words)
words = clean_symb(words)
words = rem_spec_c(words)
text = nlp(words)

In [None]:
# Keep only adjectives, nouns and proper nouns for the Twitter word cloud.
# The pieces are joined in one pass: rebuilding the string inside a loop
# copies the whole accumulated text for every token. Each word carries its
# own leading space, so the text starts with a space and the words are
# separated by single spaces.
cloud2 = "".join(
    " " + word.text.lower()
    for word in text
    if word.pos_ in ('ADJ', 'NOUN', 'PROPN')
)

In [None]:
# Render the Twitter word cloud: at most 100 words, default stop words
# removed, on a white 800x300 canvas.
cloud_options = dict(
    stopwords=STOPWORDS,
    max_words=100,
    background_color='white',
    width=800,
    height=300,
)
wordcloud = WordCloud(**cloud_options).generate(cloud2)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes around the image
plt.show()

In [None]:
# debug 
# print (cloud2)