In [None]:
# Running Tweets Sentiment Analysis on All Text

In [None]:
# This is an edited version of the prior plan -- I couldn't get rid of near-duplicate lines using fuzz.
# Assuming that private tweets (those left after removing tweets with a url) do not overlap; the data does not include retweets anyway.

In [None]:
# Tweets to Dict -- Retaining Relevant Tweets

def tweets_to_dict(tweet):
    """
    Build a trimmed-down record from one scraped tweet row.

    The identifying fields ('fullname', 'user', 'timestamp', 'text') are
    copied over unchanged.  'url', 'title' and 'url_source' are always left
    as None by this function, and 'url_dum' records whether the tweet's html
    contains the "data-expanded-url" marker.

    Parameters
    ----------
    tweet: a dictionary with the keys 'fullname', 'user', 'timestamp',
           'text' and 'html'

    Returns
    -------
    dictionary
    """
    return {
        'fullname': tweet['fullname'],
        'user': tweet['user'],
        'timestamp': tweet['timestamp'],
        'text': tweet['text'],
        # True when the html carries the expanded-url marker, False otherwise
        'url_dum': "data-expanded-url" in tweet['html'],
        'url': None,
        'title': None,
        'url_source': None,
    }

def CSVtoDict(filename, encoding=None):
    '''
    Read a CSV file and return its rows as a list of trimmed tweet dictionaries

    Every row is read as a dictionary keyed by the CSV header and passed
    through tweets_to_dict.

    Parameters
    ---------
    filename: a string
    encoding: text encoding used to open the file; None (the default) keeps
              the platform default, pass e.g. 'utf-8' to be explicit

    Returns
    -------
    a list
    '''
    import csv
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields (multi-line tweet text / html) are parsed correctly
    with open(filename, newline='', encoding=encoding) as f:
        dict_list = [tweets_to_dict(dict(line)) for line in csv.DictReader(f)]
    return(dict_list)

In [None]:
# Code that Cleans Corpus According to Time Frame

def ConvertDates(string_date):
    '''
    Parse the leading date portion of a string ("Y-M-D...") into a date

    Only the first ten characters are used; dashes are dropped and
    surrounding whitespace is trimmed before the remainder is parsed as
    year-month-day digits.

    Parameters
    ----------
    string_date: a string

    Returns
    -------
    datetime.date object
    '''
    from datetime import datetime as _datetime
    compact = string_date[:10].replace("-", "").strip()
    return _datetime.strptime(compact, "%Y%m%d").date()

def ChangeDicts(data):
    '''
    Attach a parsed date to every tweet and drop the tweets flagged with a url

    Each dictionary in data gains a 'date_conv' entry holding its
    'timestamp' run through ConvertDates; the dictionaries are modified in
    place.

    Parameters
    ----------
    data: a list of dictionaries with 'timestamp' and 'url_dum' keys

    Returns
    -------
    a new list with only the dictionaries whose 'url_dum' is not True
    '''
    # first pass: every tweet gets its converted date, including those dropped below
    for tweet in data:
        tweet['date_conv'] = ConvertDates(tweet['timestamp'])
    # second pass: remove those with url
    without_url = []
    for tweet in data:
        if tweet['url_dum'] == True:
            continue
        without_url.append(tweet)
    return without_url


# Create String Before Conducting Word Count

def PreCorpusList(data,key,s_date,e_date):
    """
    Join the text of all tweets inside a date window into one string

    Remove tweets including urls
    Remove duplicated texts (the last occurrence of each text is the one kept)
    Remove tweets including crisis key words (THAAD)
    String join all remaining text with single spaces

    Entries whose value under `key` is None are skipped, so a key that was
    never filled in yields an empty string instead of raising.

    Parameters
    ----------
    data : a list of dictionaries (modified in place by ChangeDicts, which
           adds 'date_conv' to each of them)
    key: the key in the dictionary within the list ('text' or 'title')
    s_date: start date (inclusive), in string "Y-M-D"
    e_date: end date (inclusive), in string "Y-M-D"

    Returns
    -------
    a joined string of all tweets

    Raises
    ------
    ValueError if s_date or e_date cannot be parsed by ConvertDates
    """
    import re
    # convert the dates to datetime format & remove tweets with url
    data = ChangeDicts(data)
    # parse the window bounds once instead of once per tweet
    start_date = ConvertDates(s_date)
    end_date = ConvertDates(e_date)
    # Walk the tweets back-to-front and keep the first sighting of each text:
    # that is the LAST occurrence in the original order, which is what the
    # previous quadratic "not in text_list[n + 1:]" check retained.
    seen = set()
    latest_first = []
    for d in reversed(data):
        if not (start_date <= d['date_conv'] <= end_date and d['url_dum'] is False):
            continue
        text = d[key]
        if text is None or text in seen:
            continue
        seen.add(text)
        latest_first.append(text)
    # restore the original (oldest-first) ordering
    unique_texts = latest_first[::-1]
    # Remove words related to THAAD
    fp_words = ['THAAD','thaad','사드','싸드','#사드','#싸드','#THAAD','#Thaad']
    pattern = re.compile(r"|".join(fp_words))
    removed = [s for s in unique_texts if not pattern.search(s)]
    all_text = ' '.join(removed)
    return(all_text)


def CorpusListbyHandle(data,s_date,e_date):
    """
    Select the tweets inside a date window, one dictionary per unique text

    Remove tweets including urls
    Remove tweets whose text is empty or missing
    Remove tweets including crisis key words (THAAD)
    Remove duplicated texts (the first occurrence of each text is the one kept)

    Parameters
    ----------
    data : a list of dictionaries (modified in place by ChangeDicts, which
           adds 'date_conv' to each of them)
    s_date: start date (inclusive), in string "Y-M-D"
    e_date: end date (inclusive), in string "Y-M-D"

    Returns
    -------
    a list of the tweet dictionaries that passed every filter, in their
    original order

    Raises
    ------
    ValueError if s_date or e_date cannot be parsed by ConvertDates
    """
    import re
    # convert the dates to datetime format & remove tweets with url
    data = ChangeDicts(data)
    # parse the window bounds once instead of once per tweet
    start_date = ConvertDates(s_date)
    end_date = ConvertDates(e_date)
    fp_words = ['THAAD','thaad','사드','싸드','#사드','#싸드','#THAAD','#Thaad']
    pattern = re.compile(r"|".join(fp_words))
    check_val = set()
    final_output = []
    for d in data:
        # parse according to date & remove tweets with url
        if not (start_date <= d['date_conv'] <= end_date and d['url_dum'] is False):
            continue
        text = d['text']
        # remove if 'text' is null or empty
        # NOTE(review): the comparison with the literal string 'url_source' is
        # kept from the original filter; its intent is unclear -- confirm.
        if not text or text == 'url_source':
            continue
        # remove tweets mentioning THAAD
        if pattern.search(text):
            continue
        # remove duplicates, keeping the first tweet seen for each text
        if text in check_val:
            continue
        check_val.add(text)
        final_output.append(d)
    return(final_output)


class TweetsCorpus:
    '''
    Clean raw tweet text and hand it to the Konlpy Okt tagger

    Functions
    ---------
    __init__: stores the text to be cleaned (a string or a list of strings)
    text_cleaning: returns a list with one cleaned string per item of the text
                   (one per character when the text is a single string)
    getPreWordCorpus: join the results of text_cleaning
    getWordCorpus: returns a string without punctuation and digits
    getPhrases: returns a list of phrases using Konlpy module
    getNouns: returns a list of nouns using Konlpy module
    '''
    def __init__(self, text):
        self.text = text
    def text_cleaning(self):
        '''
        Strip Latin letters and punctuation from every item of self.text and
        turn line breaks into spaces

        Returns
        -------
        a list of cleaned strings, in the same order as the items of self.text
        '''
        import re
        latin_letters = re.compile('[a-zA-Z]')
        # Raw string so the backslashes reach the regex engine untouched; the
        # character set is the same as before, without the invalid string
        # escapes ("\{", "\[", "\/", ...) that newer Pythons warn about.
        punctuation = re.compile(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\=\(\'\"...]')
        result_list = []
        for item in self.text:
            cleaned_text = latin_letters.sub('', item)
            cleaned_text = cleaned_text.replace('\n', ' ')
            cleaned_text = punctuation.sub('', cleaned_text)
            result_list.append(cleaned_text)
        return result_list
    def getPreWordCorpus(self):
        '''Return the cleaned items concatenated into one string (no separator)'''
        result=self.text_cleaning()
        doc = (''.join(result))
        return doc
    def getWordCorpus(self):
        '''
        Return the cleaned text restricted to alphanumeric characters, '+',
        ',' and spaces, with digits removed afterwards
        '''
        doc = self.getPreWordCorpus()
        text = ''.join(c for c in doc if c.isalnum() or c in '+, ')
        text = ''.join([i for i in text if not i.isdigit()])
        return text
    def getPhrases(self):
        '''Return the phrases the Okt tagger finds in the word corpus'''
        # imported here so the class can be defined without konlpy installed
        from konlpy.tag import Okt
        phrases_tagger = Okt()
        word_corpus = self.getWordCorpus()
        phrases = phrases_tagger.phrases(word_corpus)
        return phrases
    def getNouns(self):
        '''Return the nouns the Okt tagger finds in the word corpus'''
        # imported here so the class can be defined without konlpy installed
        from konlpy.tag import Okt
        nouns_tagger = Okt()
        word_corpus = self.getWordCorpus()
        nouns = nouns_tagger.nouns(word_corpus)
        return nouns

In [None]:
# Read the CSV File, Split to Pre and Post

# fn = '20150101_china_tweets_20170731.csv'
# dt = CSVtoDict(fn)

In [None]:
# Tuesday August 20 -- just create pandas using user column, tweet column, and the cleaned tweet using the class above

# then follow the instructions to run LDA on the individual tweets

In [None]:
# packages to store and manipulate data
# import pandas as pd
# import numpy as np

# # plotting packages
# import matplotlib.pyplot as plt
# import seaborn as sns

# # model building package
# import sklearn

In [None]:
# get the Corpus List by Handle as pd.data frame:
# CorpusListbyHandle(data,s_date,e_date)
# Step 1: clean the texts in corpus list by handle:


In [None]:
# sentiment analyses

# print(getSentiScore(TweetsCorpus(pre_joined_raw_string)),getSentiScore(TweetsCorpus(post_joined_raw_string)))

In [None]:
# run LDAs on the saved json files

In [None]:
# for stm -- create files based on the writer
# list of dictionaries by author: [{author: joined_strings}, ...]