In [None]:
'''
Florida International University - Data Science MS
CAP 5640 - NLP - Spring 2019
Andrea Garcia and Constanza Schubert

JSON files to Python Dataframe
'''

In [216]:
#Load libraries
import pandas as pd
import json
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
# STEP 1: READ FILE AND DELETE BLANK LINES

def deleteblanks(filetoedit, newfilename):
    """Copy a line-delimited JSON file, dropping every whitespace-only line.

    Parameters
    ----------
    filetoedit : str
        Path of the original file that may contain blank lines.
    newfilename : str
        Path of the cleaned file to create (overwritten if it already exists).

    Returns
    -------
    None
        The cleaned content is written to ``newfilename``.
    """
    # Both handles are managed by the with-statement so the output is flushed
    # and closed even if reading fails part-way through.
    with open(filetoedit, 'r') as source, open(newfilename, 'w') as cleaned:
        for fileline in source:
            if fileline.isspace():
                continue
            # Terminate every record exactly once: a final line without a newline
            # gets one, and no extra blank line is appended at the end of the file
            # (createdict cannot parse blank lines).
            cleaned.write(fileline if fileline.endswith('\n') else fileline + '\n')

    return

In [3]:
# STEP 2: CREATE NESTED DICTIONARY

def createdict(filename):
    """Load a line-delimited JSON file into a dictionary of parsed records.

    Parameters
    ----------
    filename : str
        Path to a file holding one JSON document per line.

    Returns
    -------
    dict
        Maps a 1-based record number to the object parsed from that line.
        Whitespace-only lines are skipped and do not consume a number, so a
        file without blank lines yields exactly the same keys as before.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(filename, "r") as jsondata:
        # Skipping blank lines here keeps a stray empty line (e.g. at the end of
        # the file) from crashing json.loads.
        records = (tline for tline in jsondata if not tline.isspace())
        return {number: json.loads(tline) for number, tline in enumerate(records, start=1)}


In [115]:
#fix function to expand out user, extended_tweet, retweeted_status, and quoted_status columns
#for user I need user_id and for extended_tweet I need full_text
#for retweets: retweeted_status -> extended_tweet -> full_text

# STEP 5: CREATE DATAFRAME FROM DICTIONARY

def createdtframe1(tweetdict):
    """Build a tweet DataFrame from the nested dictionary of parsed tweets.

    Parameters
    ----------
    tweetdict : dict
        Maps a row label to one parsed tweet; each key becomes a row and each
        top-level field of the tweet becomes a column.
        Presumably the nested 'extended_tweet', 'retweeted_status' and
        'quoted_status' values are dicts (or missing) -- TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``keep_columns``: selected top-level fields plus
        fields expanded from the nested extended / retweeted / quoted objects.
        For rows whose 'id_str_retweeted' is not null, 'text' holds the value
        of 'text_retweeted'.

    Raises
    ------
    KeyError
        If one of the columns that is indexed or kept below is absent.
    """
    # One row per dictionary key; top-level tweet fields become the columns.
    df = pd.DataFrame.from_dict(tweetdict, orient='index')
    # Expand each nested column into its own columns and join them back on the
    # row index. rsuffix is only added to joined names that collide with a
    # column already in df, which is why e.g. 'full_text' carries no suffix
    # while 'id_str_retweeted' does.
    df = df.join(pd.DataFrame(df['extended_tweet'].to_dict()).T,rsuffix='_extended')
    df = df.join(pd.DataFrame(df['retweeted_status'].to_dict()).T,rsuffix='_retweeted')
    df = df.join(pd.DataFrame(df['quoted_status'].to_dict()).T,rsuffix='_quoted')
    #Get full_text of retweet: expand the retweeted tweet's own extended_tweet and keep
    #only its 'full_text', which collides with the existing 'full_text' and so becomes
    #'full_text_retweeted'
    df = df.join(pd.DataFrame(df['extended_tweet_retweeted'].to_dict()).T[['full_text']],rsuffix='_retweeted')
    # Columns retained for downstream processing; everything else is dropped.
    keep_columns= ['created_at',
                    'id_str',
                    'text',
                    'truncated',
                    'in_reply_to_status_id_str',
                    'is_quote_status',
                    'retweeted',
                    'quoted_status_id_str',
                    'quoted_status',
                    'full_text',
                    'id_str_retweeted',
                    'in_reply_to_status_id_str_retweeted',
                    'is_quote_status_retweeted',
                    'quoted_status_retweeted',
                    'quoted_status_id_str_retweeted',
                    'retweeted_retweeted',
                    'text_retweeted',
                    'truncated_retweeted',
                    'extended_entities_quoted',
                    'extended_tweet_quoted',
                    'id_str_quoted',
                    'in_reply_to_status_id_str_quoted',
                    'is_quote_status_quoted',
                    'quoted_status_id_str_quoted',
                    'retweeted_quoted',
                    'text_quoted',
                    'truncated_quoted',
                  'full_text_retweeted']

    df = df[keep_columns]
    
    #For retweets (rows where id_str_retweeted is not null), replace the text value with
    #text_retweeted; per the authors' note this is meant to get rid of the RT before the text
    df['text']=np.where(df['id_str_retweeted'].isnull(),df['text'],df['text_retweeted'])

    
    
    #Earlier approach to expanding user, extended_tweet, retweeted_status, and quoted_status
    #(left disabled):
    #pd.concat([df.drop('user',axis=1),df['user'].apply(pd.Series)],axis=1)
    #df = pd.concat([df.drop('extended_tweet',axis=1),df['extended_tweet'].apply(pd.Series)],axis=1)
    #pd.concat([df.drop('retweeted_status',axis=1),df['retweeted_status'].apply(pd.Series)],axis=1)
    #pd.concat([df.drop('quoted_status',axis=1),df['quoted_status'].apply(pd.Series)],axis=1)
    return df

In [116]:
#Process Apple and Samsung JSON files using helper functions
# Forward slashes avoid the invalid "\D" / "\c" escape sequences of the old
# Windows-style literals and are accepted by open() on every platform.
apple_dict = createdict('./Data/clean_apple.json')
# NOTE(review): this line previously re-read clean_apple.json (copy-paste slip), so
# samsung_df was a duplicate of apple_df. The Samsung file is assumed to be named
# clean_samsung.json -- confirm against the Data folder.
samsung_dict = createdict('./Data/clean_samsung.json')
apple_df=createdtframe1(apple_dict)
samsung_df=createdtframe1(samsung_dict)

In [229]:
# Preview the first rows of the Apple tweet frame to sanity-check the expanded columns.
apple_df.head()

Unnamed: 0,created_at,id_str,text,truncated,in_reply_to_status_id_str,is_quote_status,retweeted,quoted_status_id_str,quoted_status,full_text,...,extended_entities_quoted,extended_tweet_quoted,id_str_quoted,in_reply_to_status_id_str_quoted,is_quote_status_quoted,quoted_status_id_str_quoted,retweeted_quoted,text_quoted,truncated_quoted,full_text_retweeted
1,Sun Jan 27 19:24:51 +0000 2019,1089605170252705799,RT @A7laFe: Ok guys its time for Dimonds avail...,False,,False,False,,,,...,,,,,,,,,,Ok guys its time for Dimonds available now in ...
2,Sun Jan 27 19:24:58 +0000 2019,1089605199587618817,It makes me chuckle when articles claim that t...,True,,False,False,,,It makes me chuckle when articles claim that t...,...,,,,,,,,,,
3,Sun Jan 27 19:25:01 +0000 2019,1089605211864399874,This was pretty cool! Thank you @apple for hav...,True,,False,False,,,This was pretty cool! Thank you @apple for hav...,...,,,,,,,,,,
4,Sun Jan 27 19:25:06 +0000 2019,1089605236812103680,@BulletinAtomic @POTUS @DAVOS @WEF @ENERGY @YE...,True,1.0886793617738916e+18,False,False,,,@BulletinAtomic @POTUS @DAVOS @WEF @ENERGY @YE...,...,,,,,,,,,,
5,Sun Jan 27 19:25:26 +0000 2019,1089605318366191616,I’m pretty sure I just discovered that @Family...,True,,False,False,,,I’m pretty sure I just discovered that @Family...,...,,,,,,,,,,


In [None]:
#Retweets
# Retweets should be discarded only if original tweet is in corpus.

#Search for retweet as original tweet in corpus
def filter_retweets(df):
    """Drop retweets whose original tweet is itself present in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'id_str' (the tweet's own id) and 'id_str_retweeted'
        (id of the tweet being retweeted, null for non-retweets).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` minus every retweet that points at an 'id_str'
        found in ``df``. Retweets of tweets outside the frame are kept.
    """
    retweeted_ids = df['id_str_retweeted']
    # Ids of every original tweet that some retweet in the frame refers to.
    referenced_originals = list(retweeted_ids[retweeted_ids.notnull()])
    # Of those, the originals that actually appear in the frame.
    is_present_original = df['id_str'].isin(referenced_originals)
    originals_in_corpus = list(df.loc[is_present_original, 'id_str'])
    # A retweet is redundant when the tweet it points at is in the frame.
    is_redundant_retweet = retweeted_ids.isin(originals_in_corpus)
    return df[~is_redundant_retweet]


In [None]:
#Need to check whether a record is an extended tweet and, if so, replace the text field with the extended tweet's full text

def extract_extended_tweet(df):
    """Replace truncated tweet text with the full extended text where available.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text' and 'full_text'. If a 'full_text_retweeted' column
        is present it is used as well (see below).

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'text' overwritten in place:
        * rows with a non-null 'full_text' take that value;
        * rows with a non-null 'full_text_retweeted' take that value, so that
          retweets get the full text of the tweet they retweet;
        * all other rows keep their current 'text'.
    """
    text = np.where(df['full_text'].isnull(), df['text'], df['full_text'])
    # Previously the retweet's full text was never applied, so retweets of
    # extended tweets kept their truncated text. The column is optional to stay
    # compatible with frames that do not carry it.
    if 'full_text_retweeted' in df.columns:
        text = np.where(df['full_text_retweeted'].isnull(), text, df['full_text_retweeted'])
    df['text'] = text
    return df

In [197]:
#Text preprocessing functions

def clean_html(text):
    """Parse ``text`` as HTML with the 'html5lib' parser and return its text content."""
    return BeautifulSoup(text, 'html5lib').get_text()
# def extract_hashtags(s):
#     hashed = set(part[1:] for part in s.split() if part.startswith('#')) 
#     return len(hashed)   


def clean_tweet(tweet):
    """Blank out twitter handles, links and special characters in a tweet.

    Every match is replaced by a single space, so the result may contain runs
    of spaces. Letters, digits, spaces, tabs, '#', '$' and the basic
    punctuation . , : ; ! ? ' are kept (punctuation is needed later for
    dependency parsing; hashtags stay as '#word' separated by spaces).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with each handle, link or disallowed character replaced by " ".
    """
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (\., \/, \w, \S), which Python reports as a SyntaxWarning from 3.12 on.
    # Alternatives, in order: @handle | any character outside the keep-list | scheme://link
    noise_pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z\.,:;!?'$# \t])|(\w+:\/\/\S+)"
    return re.sub(noise_pattern, " ", tweet)

#Emoticon handling
def remove_emojis(tweet):
    """Delete every character of ``tweet`` that falls in the emoji ranges listed below.

    Characters outside these four ranges are left untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    # One character class covering all ranges; '+' removes a run in one match.
    character_class = "[" + "".join(emoji_ranges) + "]+"
    return re.sub(character_class, r'', tweet, flags=re.UNICODE)

def handle_hashtags(tweet):
    """Placeholder for hashtag handling; currently returns ``tweet`` unchanged.

    TODO: the intended behaviour (per the note below) is not implemented yet.
    """
    #separate hashtag terms with semi-colons
    return tweet

#need function to remove extra whitespace: all tabs, newlines, and other whitespace-like characters
def remove_whitespace(tweet):
    """Collapse every run of whitespace into one space and trim both ends.

    Parameters
    ----------
    tweet : str
        Text that may contain tabs, newlines or repeated spaces.

    Returns
    -------
    str
        The text with single spaces between words and no leading/trailing whitespace.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence
    # (SyntaxWarning from Python 3.12 on).
    return re.sub(r'\s+', ' ', tweet).strip()

In [None]:
#Tokenization

def tokenize(df):
    """Add a 'tokenized_tweets' column holding lower-cased word tokens per row.

    Tokens come from nltk's ``word_tokenize`` applied to the 'sentences' column.
    NOTE(review): no visible code creates a 'sentences' column -- confirm it is
    added elsewhere before this runs.

    The frame is modified in place and also returned.
    """
    def _tokens_for(row):
        return word_tokenize(row['sentences'])

    def _lowercased(tokens):
        return [token.lower() for token in tokens]

    # Row-wise tokenisation first, then a separate lower-casing pass.
    df['tokenized_tweets'] = df.apply(_tokens_for, axis=1)
    df['tokenized_tweets'] = df['tokenized_tweets'].apply(_lowercased)
    return df

#Removal of stop words
def remove_stopwords(df):
    """Add a 'tweet_no_stop' column: 'tokenized_tweets' without English stop words.

    The stop-word list is nltk's ``stopwords.words("english")``. The frame is
    modified in place and also returned.
    """
    english_stops = set(stopwords.words("english"))

    def _without_stops(tokens):
        return [token for token in tokens if token not in english_stops]

    df['tweet_no_stop'] = df['tokenized_tweets'].apply(_without_stops)
    return df


In [None]:
#Apply text preprocessing functions

def preprocess_text(df):
    """Run the full text-cleaning pipeline on a tweet DataFrame.

    Steps, in order: drop retweets whose original is in the frame, swap in
    extended-tweet text, then clean the 'text' column (HTML, emojis,
    handles/links/special characters, hashtags, whitespace).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns required by ``filter_retweets`` and
        ``extract_extended_tweet``, including 'text'.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with a cleaned 'text' column; the frame passed
        in is not modified.
    """
    # filter_retweets returns a row-filtered slice; take an explicit copy so the
    # column assignments below do not raise SettingWithCopyWarning.
    df = filter_retweets(df).copy()
    df = extract_extended_tweet(df)
    # Order matters: HTML and emojis are stripped before the character filter,
    # and whitespace is normalised last because earlier steps insert spaces.
    text_steps = (clean_html, remove_emojis, clean_tweet, handle_hashtags, remove_whitespace)
    for step in text_steps:
        df['text'] = df['text'].apply(step)
    return df
