# EDA

## Reading Files
* page_politician_info
* top_1000_page
* politician_pages

## Data Cleaning
* top_1000_page

## Merging Tables


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
%matplotlib inline

In [2]:
import os
# NOTE(review): absolute, machine-specific path -- the notebook only runs unchanged on this host.
# The relative 'input/...' paths used later resolve against this directory.
os.chdir('/home3/r09725056/Desktop/analysis-ChingYaoL')
print(os.getcwd())
# Use relative paths in the code

/home3/r09725056/Desktop/analysis-ChingYaoL


In [3]:
# Show every entry of the working directory on one line, each followed by a space.
print(''.join('{} '.format(entry) for entry in os.listdir()), end='')

input code output temp .git README.md README_about_USFB_Data.pdf 

#### Suggested workflow from README
* Use relative paths in the code
* Read data from _input_
* Export generated tables or figures to _output_
* Read/Write other temporary files from _temp_

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # display the output of every expression in a cell (not only the last one), so DataFrame outputs stack up
pd.set_option('display.max_columns', None) # show all columns

In [5]:
from datetime import datetime
from pandas_profiling import ProfileReport

In [6]:
import contractions 
import nltk
import string
import fasttext
import contractions # resolving contractions and slangs, e.g. "yall're happy now" --> "you all are happy now"
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

* [PyPI for fasttext](https://pypi.org/project/fasttext/)
* [medium for fasttext](https://medium.com/@c.chaitanya/language-identification-in-python-using-fasttext-60359dc30ed0)
* [GitHub for contractions](https://github.com/kootenpv/contractions)

## 2016 US Presidential Election: Nov 8, 2016 (Tue)

In [7]:
# Page
# Three page-level tables, read from input/page relative to the working directory.
page_info = pd.read_csv(r'input/page/1000-page-info.csv')
politician_info = pd.read_csv(r'input/page/politician-info.csv')
page_politician_info = pd.read_csv(r'input/page/1000-page-and-politician-info.csv')

# Post
# The two *_CT time columns of the 1000-page posts are parsed into datetimes at read time.
top_1000_pages = pd.read_csv(r'input/post/1000-page/2015-01-01-to-2017-04-08.csv', parse_dates=['post_created_time_CT', 'post_updated_time_CT'])
# NOTE(review): no parse_dates here, unlike top_1000_pages -- confirm whether the politician posts carry the same time columns.
politician_pages = pd.read_csv(r'input/post/politician/2015-01-01-to-2016-11-30.csv')

In [None]:
# Reaction (LIKE, LOVE, HAHA, WOW, SAD, ANGRY, THANKFUL)
## 1. Reactions on 1000-page
#### A. Every-20-minutes (2016-09-29 ~ 2016-11-21)
def date_parser(unixTime):
    """Convert Unix timestamps given in seconds into pandas datetimes."""
    return pd.to_datetime(unixTime, unit='s')


def _read_every20min_reactions(reaction_type):
    """Read the every-20-minutes table of one reaction type (e.g. 'LIKE').

    The file is input/reaction/1000-page/20-min/by-reaction-type/<reaction_type>.csv
    and its `reaction_time` column is converted with `date_parser`.
    """
    path = r'input/reaction/1000-page/20-min/by-reaction-type/{}.csv'.format(reaction_type)
    # NOTE(review): `date_parser` is deprecated from pandas 2.0 on; kept because the
    # pandas version of this environment is not visible here.
    return pd.read_csv(path, parse_dates=['reaction_time'], date_parser=date_parser)


# One table per reaction type; the variable names are unchanged.
every20min_like = _read_every20min_reactions('LIKE')
every20min_love = _read_every20min_reactions('LOVE')
every20min_haha = _read_every20min_reactions('HAHA')
every20min_wow = _read_every20min_reactions('WOW')
every20min_sad = _read_every20min_reactions('SAD')
every20min_angry = _read_every20min_reactions('ANGRY')
every20min_thankful = _read_every20min_reactions('THANKFUL')

#### B. LIKE by US political users (2015-01-01 ~ 2016-11-30)
# The directory holds one CSV per post date; the file name without extension is the date.
like_dir = r'input/reaction/1000-page/2015-01-01-to-2016-11-30/us-political-user/by-reaction-type/LIKE/by-post-date'
like_by_post_dates = [
    entry.split('.')[0]
    for entry in sorted(os.listdir(like_dir))
    if not entry.startswith('.')  # skip hidden files
]

LIKE_dfs = []
for post_date in like_by_post_dates:
    daily_likes = pd.read_csv('{}/{}.csv'.format(like_dir, post_date))
    # Tag every row with the date taken from the file name
    daily_likes['post_date'] = datetime.strptime(post_date, "%Y-%m-%d")
    LIKE_dfs.append(daily_likes)

LIKE_on_1000_page = pd.concat(LIKE_dfs)

## 2. Reactions on Politician 
#### A. LIKE by US political users (2015-01-01 ~ 2016-11-30)


# Comment
## a total of 500 csv files
## NOTE: The files are LARGE! eg. 000000000000.csv, as DataFrame, has a shape of (2863013, 4) and memory usage of 87.4+ MB
comment_dir = r'input/comment/2015-01-01-to-2016-11-30'
tables = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the row
# order of `comments` differ between runs/machines.
for fileName in sorted(os.listdir(comment_dir)):
    if fileName.startswith('.'):  # hidden file, not a data file -- same rule as for the LIKE files
        continue
    tables.append(pd.read_csv('{}/{}'.format(comment_dir, fileName), parse_dates=["comment_created_time"]))
comments = pd.concat(tables)



In [None]:
# (rows, columns) of each page and post table loaded above
page_info.shape
politician_info.shape
page_politician_info.shape
top_1000_pages.shape
politician_pages.shape

In [None]:
# Peek at the first row to see which columns are available
top_1000_pages.head(1)

In [None]:
# Get rid of unwanted columns
cols_to_drop = ['post_picture', 'post_link', 'post_created_time', 'post_updated_time']
# DataFrame.drop returns a new frame and leaves the original untouched;
# rebind the name so the columns are actually removed for the following cells.
top_1000_pages = top_1000_pages.drop(columns=cols_to_drop)

In [None]:
def func_to_nonstring(func):
    """
    A decorator that lets a string-processing function pass non-string arguments through unchanged.

    The wrapped function is called as usual. If it raises IndexError or
    AttributeError (e.g. because it received None or a float NaN instead of a
    str), the argument itself is returned. Any other exception propagates.
    """
    import functools  # local import: keeps this cell self-contained

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper_func(x):
        try:
            return func(x)
        except (IndexError, AttributeError):
            return x
    return wrapper_func

@func_to_nonstring
def contract_message(x):
    """Return x with its contractions expanded by contractions.fix.

    Through the decorator, x is returned as-is when the call raises
    IndexError or AttributeError.
    """
    expanded = contractions.fix(x)
    return expanded

In [None]:
# Expand contractions in every post message, overwriting the column in place
top_1000_pages['post_message'] = top_1000_pages['post_message'].apply(contract_message)

In [None]:
# Number of missing and of present post messages
n_missing = top_1000_pages['post_message'].isna().sum()
n_present = top_1000_pages['post_message'].notna().sum()
print(n_missing)
print(n_present)
# Null rate = missing / ALL rows (the mean of the boolean mask), not missing / present,
# and multiplied by 100 so the value really is the percentage the "%" suffix announces.
print(f"null rate: {top_1000_pages['post_message'].isna().mean() * 100:.6f} %")

### English Language Detection
The reason I use fasttext: [Benchmarking Language Detection for NLP](https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c)

In [None]:
# Pretrained fastText model file (lid.176.bin) used for language detection below.
# NOTE(review): absolute path into one user's conda environment -- not portable; confirm where the model should live.
pretrained_model = "/home3/r09725056/.conda/envs/usfb/lib/python3.7/site-packages/fasttext/lid.176.bin"
model = fasttext.load_model(pretrained_model)

@func_to_nonstring
def predict_language(sent):
    """Return the language code the fastText model predicts for sent, e.g. 'en'.

    Newlines in sent are replaced by spaces before prediction.
    """
    single_line = sent.replace('\n', ' ')
    # model.predict() returns a tuple like this: (('__label__en',), array([0.95346403]))
    prediction = model.predict(single_line)
    top_label = prediction[0][0]
    # Keep only what follows the last underscore: '__label__en' -> 'en'
    return top_label.split('_')[-1]

In [None]:
# Predicted language code of every post message, stored in a new 'language' column
top_1000_pages['language'] = top_1000_pages['post_message'].apply(predict_language)

In [None]:
# Share of each value in the 'language' column (NaN kept as its own entry), rounded to 6 decimals
top_1000_pages['language'].value_counts(dropna=False, normalize=True).round(6)

In [None]:
def filter_top_unique_elements(ser, thres=0.001, ndecimal=6, show_null_rate=True):
    """
    Return the counts of the unique elements of ser whose share of all rows
    (missing values included) is at least thres.

    Parameters
    ----------
    ser : pd.Series
        Values to count.
    thres : float
        Minimum share an element must reach to be kept (0.001 means 0.1 %).
    ndecimal : int
        Number of decimals passed to .round() on the returned counts.
    show_null_rate : bool
        If True, print the percentage of null rows of ser.

    Returns
    -------
    pd.Series
        Counts indexed by unique element, in value_counts order; NaN is kept
        as its own entry.
    """
    if show_null_rate:
        print(f"null rate = {ser.isnull().mean() * 100:.6f} %")
    # Count once and derive the shares from the counts instead of running
    # value_counts twice over the whole Series.
    counts = ser.value_counts(dropna=False)
    shares = counts / counts.sum()
    return counts[shares >= thres].round(ndecimal)

# Keep the languages that account for at least 0.1% of all rows, then assign them to top_languages
lang_thres = 0.001
top_languages = filter_top_unique_elements(top_1000_pages['language'], lang_thres)

In [None]:
# A simpler way to format yticks
# fig, ax = plt.subplots(figsize=(12, 8))
# f = mticker.ScalarFormatter(useOffset=False, useMathText=True)
# g = lambda x, pos : "${}$".format(f._formatSciNotation('%1.10e' % x))
# plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(g))
# top_languages.plot.bar()

In [None]:
# Create the figure first; the formatter and the bar plot below act on its axes via plt.gca()
fig, ax = plt.subplots(figsize=(12, 8))
# Customize Yticks
class MathTextSciFormatter(mticker.Formatter):
    r"""Tick formatter that renders a value in scientific notation as mathtext.

    The value is first formatted with the printf-style pattern `fmt`
    (default "%1.2e"), then rewritten as `$<significand>{\times}10^{<exponent>}$`.
    """

    def __init__(self, fmt="%1.2e"):
        self.fmt = fmt

    def __call__(self, x, pos=None):
        # Split e.g. '1.20e+05' into the significand and the signed exponent
        pieces = (self.fmt % x).split('e')
        mantissa = pieces[0].rstrip('.')
        sign_char = pieces[1][0]
        exp_sign = '' if sign_char == '+' else sign_char  # a leading '+' is dropped
        exp_digits = pieces[1][1:].lstrip('0')  # leading zeros of the exponent are dropped

        # A zero exponent leaves no digits; the power of ten is omitted then
        power = '10^{%s%s}' % (exp_sign, exp_digits) if exp_digits else exp_digits
        joiner = r'{\times}' if (mantissa and power) else ''
        return "${}$".format(mantissa + joiner + power)

# Format with 2 decimal places
# The formatter is installed on the y axis of the current axes
plt.gca().yaxis.set_major_formatter(MathTextSciFormatter("%1.2e"))
# Plot Top Languages
plt.title("Number of Samples for Top Languages", size=20)
plt.xticks(size=20)
plt.yticks(size=15)
# Bar chart of the counts in top_languages; the trailing ';' keeps the returned object from being echoed in the cell output
top_languages.plot.bar();

In [None]:
# filter for languages that's at least 0.01%
# Keep the post names that account for at least 0.01% of all rows (this is post_name, not language)
filter_top_unique_elements(top_1000_pages['post_name'], thres=0.0001)

In [None]:
# Share of each language (NaN included), rendered as percentage strings with 6 decimals
top_1000_pages['language'].value_counts(normalize=True, dropna=False).apply(lambda x: f"{x * 100:.6f}%")

In [None]:
# Number of rows per post type; missing values are counted as their own entry
top_1000_pages['post_type'].value_counts(dropna=False)

In [None]:
# Preview the first five rows
top_1000_pages.head()

In [None]:
# thres=0 keeps every distinct caption, i.e. this is the full count table
filter_top_unique_elements(top_1000_pages['post_caption'], thres=0)
# It seems that captions are usually webpages or links

In [None]:
# Keep the descriptions that account for at least 0.02% of all rows
filter_top_unique_elements(top_1000_pages['post_description'], thres=0.0002)