In [428]:
'''
Florida International Univeristy - Data Science MS
CAP 5640 - NLP - Spring 2019
Andrea Garcia and Constanza Schubert

JSON files to Python Dataframe
'''

'\nFlorida International Univeristy - Data Science MS\nCAP 5640 - NLP - Spring 2019\nAndrea Garcia and Constanza Schubert\n\nJSON files to Python Dataframe\n'

In [429]:
#Load libraries
import pandas as pd
import json
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk import word_tokenize

In [430]:
# STEP 1: READ FILE AND DELETE BLANK LINES

def deleteblanks(filetoedit, newfilename):
    #Accepts two .json files as input, original file name (needs cleaning) and new file name (function will create new  .json clean file under this name) 
    newfile = open(newfilename, 'w')
    with open(filetoedit, 'r') as f:
        print ("".join(fileline for fileline in f if not fileline.isspace()), file=newfile)

    return

In [431]:
# STEP 2: CREATE NESTED DICTIONARY

def createdict(filename):
    #Accepts a .json file as input and converts data into a nested dictionary. File must not contain any empty lines before, throughout or after main body.
    with open(filename, "r") as jsondata:
        tweetlines = []
        for tline in jsondata:
            tweetlines.append(tline)

    tweetdict = {}
    i = 0
    while i < len(tweetlines):
        tweetdict[i + 1] = json.loads(tweetlines[i])
        i += 1
    return tweetdict


In [462]:
#fix function to expand out user, extended_tweet, retweet_status, and quoted status columns
#for user I need user_id and for extended_tweet I need full_text
#for retweets: retweet_status -> full_text

# STEP 5: CREATE DATAFRAME FROM DICTIONARY

def createdtframe(tweetdict):
    #Accepts python dictionary as input and converts to dataframe using keys as columns and values as rows
    #Columns comprised of only top level keys
    df = pd.DataFrame.from_dict(tweetdict, orient='index')
    df = df.join(pd.DataFrame(df['extended_tweet'].to_dict()).T,rsuffix='_extended')
    df = df.join(pd.DataFrame(df['retweeted_status'].to_dict()).T,rsuffix='_retweeted')
    #df = df.join(pd.DataFrame(df['quoted_status'].to_dict()).T,rsuffix='_quoted')
    #Get full_text of retweet
    df = df.join(pd.DataFrame(df['extended_tweet_retweeted'].to_dict()).T[['full_text']],rsuffix='_retweeted')
    keep_columns= ['created_at',
                    'id_str',
                    'text',
                    'full_text',
                    'id_str_retweeted',
                    'text_retweeted',
                  'full_text_retweeted']

    df = df[keep_columns]
    
    #For retweets, replace text value with text_retweeted, getting rid of the RT before the text
    #Also applies for retweets
    df['text']=np.where(df['id_str_retweeted'].isnull(),df['text'],df['text_retweeted'])

    return df

In [488]:
#Process Apple and Samsung JSON files using helper functions
apple_dict = createdict('.\Data\clean_apple.json')
samsung_dict = createdict('.\Data\clean_samsung.json')
apple_df=createdtframe(apple_dict)
samsung_df=createdtframe(samsung_dict)

In [489]:
apple_df.head()

Unnamed: 0,created_at,id_str,text,full_text,id_str_retweeted,text_retweeted,full_text_retweeted
1,Sun Jan 27 19:24:51 +0000 2019,1089605170252705799,Ok guys its time for Dimonds available now in ...,,1.0884167102255268e+18,Ok guys its time for Dimonds available now in ...,Ok guys its time for Dimonds available now in ...
2,Sun Jan 27 19:24:58 +0000 2019,1089605199587618817,It makes me chuckle when articles claim that t...,It makes me chuckle when articles claim that t...,,,
3,Sun Jan 27 19:25:01 +0000 2019,1089605211864399874,This was pretty cool! Thank you @apple for hav...,This was pretty cool! Thank you @apple for hav...,,,
4,Sun Jan 27 19:25:06 +0000 2019,1089605236812103680,@BulletinAtomic @POTUS @DAVOS @WEF @ENERGY @YE...,@BulletinAtomic @POTUS @DAVOS @WEF @ENERGY @YE...,,,
5,Sun Jan 27 19:25:26 +0000 2019,1089605318366191616,I’m pretty sure I just discovered that @Family...,I’m pretty sure I just discovered that @Family...,,,


In [477]:
#Retweets
# Retweets should be discarded only if original tweet is in corpus.

#Search for retweet as original tweet in corpus
def filter_retweets(df):
    original_tweet=list(df[df['id_str_retweeted'].notnull()]['id_str_retweeted'])
    #if original tweet is in corpus, discard retweet by id_str_retweeted
    retweets_discard = list(df[df['id_str'].isin(original_tweet)]['id_str'])
    df = df[~df['id_str_retweeted'].isin(retweets_discard)]
    
    return df


In [478]:
#Need to check whether a record is an extended tweeet and if so, replace extended tweet on text field

def extract_extended_tweet(df):
    df['text']=np.where(df['full_text'].isnull(),df['text'],df['full_text'])
    #for retweets
    df['text']=np.where(df['full_text_retweeted'].isnull(),df['text'],df['full_text_retweeted'])
    return df

In [558]:
#Text preprocessing functions

def upper_repl(match):
    return  match.group(1).capitalize()

def clean_html(text):
    soup = BeautifulSoup(text, 'html5lib')    
    souped = soup.get_text()
    return souped

def clean_tweet(tweet):
    #Utility function to clean the text in a tweet by removing links, twitter handles, and special characters using regex
    #keeps basic punctuation because you need it for dependency parsing
    #currently keeps hashtags separated by a space. should be separate hashtags like sentences?
    #re.sub("(@[A-Za-z0-9_]+)|([^0-9A-Za-z\.,:;!?'$# \t])|(\w+:\/\/\S+)", " ", tweet)
    t0=re.sub("([^0-9A-Za-z\.,:;!?'$@# \t])|(\w+:\/\/\S+)", " ", tweet)
    #return list of hashtag terms for reference
    t_handles=re.compile(r"@[A-Za-z0-9_]+").findall(tweet)
    
    #remove hashtag symbol from twitter handles and capitalize terms
    return t_handles, re.sub('@([A-Za-z0-9_]+)',upper_repl,t0)

#Emoticon handling
def remove_emojis(tweet):
    emoji_pattern = re.compile("["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'',tweet)

#need function to remove extra whitespace: all tabs, newlines, and other whitespace-like characters
def remove_whitespace(tweet):
    return re.sub('\s+',' ',tweet).strip()

In [553]:
# t="This was pretty cool! Thank you @apple for having me. And thank you to everyone who came !! Had so much fun, super cool chatting about production and songwriting too! "
# re.findall(r'@([A-Za-z0-9_]+)',t)
# # def upper_repl(match):
# #     return  match.group(1).capitalize()
# re.sub('@([A-Za-z0-9_]+)',upper_repl,t)

# re.sub("([^0-9A-Za-z\.,:;!?'@$# \t])|(\w+:\/\/\S+)", " ", t)

'This was pretty cool! Thank you @apple for having me. And thank you to everyone who came !! Had so much fun, super cool chatting about production and songwriting too! '

In [480]:
def hashtag_list(tweet):
    #return list of hashtag terms for reference
    splithash=re.compile(r"#(\w+)")
    return splithash.findall(tweet)

In [502]:
def handle_hashtags(tweet):
    #Step 1: return text with hashtags separated by semicolons
    p1=r'#(\w+)'
    t=re.sub(p1,r'\1;',tweet)
    #Step 2: split up hashtags consisting of multiple capitalized words into separate words
    t1=' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', t))
    #t1=re.sub( r"([A-Z])", r" \1", t)
    #remove whitespace
    t1=remove_whitespace(t1)
    return t1

In [482]:
#Tokenization

def tokenize(df):
    df['tokenized_tweets'] = df.apply(lambda row: word_tokenize(row['sentences']), axis=1)
    #lowercase
    df['tokenized_tweets'] = df['tokenized_tweets'].apply(lambda x: [item.lower() for item in x])
    return df

#Removal of stop words
def remove_stopwords(df):
    stops = set(stopwords.words("english"))
    df['tweet_no_stop']=df['tokenized_tweets'].apply(lambda x: [item for item in x if item not in stops])
    return df


In [559]:
#Apply text preprocessing functions

def preprocess_text(df):
    #In this particular order
    df=filter_retweets(df)
    df=extract_extended_tweet(df)
    df['text']=df['text'].apply(remove_emojis)
    df['text'] = df['text'].apply(clean_html)
    df['Twitter handles'],df['text'] = zip(*df['text'].apply(clean_tweet))
    df['Hashtags'] = df['text'].apply(hashtag_list)
    df['text']=df['text'].apply(handle_hashtags)
    df['text']=df['text'].apply(remove_whitespace)
    return df


In [560]:
#Processed tweets are in df['text'] column
apple_clean=preprocess_text(apple_df)
samsung_clean=preprocess_text(samsung_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

In [563]:
apple_clean.head()

Unnamed: 0,created_at,id_str,text,full_text,id_str_retweeted,text_retweeted,full_text_retweeted,Twitter handles,Hashtags
1,Sun Jan 27 19:24:51 +0000 2019,1089605170252705799,Ok guys its time for Dimonds available now in ...,,1.0884167102255268e+18,Ok guys its time for Dimonds available now in ...,Ok guys its time for Dimonds available now in ...,[],"[cydia, theme, ios, jailbreak, anemone, iPhone..."
2,Sun Jan 27 19:24:58 +0000 2019,1089605199587618817,It makes me chuckle when articles claim that t...,It makes me chuckle when articles claim that t...,,,,"[@fitbit, @Apple]",[]
3,Sun Jan 27 19:25:01 +0000 2019,1089605211864399874,This was pretty cool! Thank you Apple for havi...,This was pretty cool! Thank you @apple for hav...,,,,[@apple],[todayatapple]
4,Sun Jan 27 19:25:06 +0000 2019,1089605236812103680,Bulletinatomic Potus Davos Wef Energy Yearsofl...,@BulletinAtomic @POTUS @DAVOS @WEF @ENERGY @YE...,,,,"[@BulletinAtomic, @POTUS, @DAVOS, @WEF, @ENERG...","[HSS, LRAD]"
5,Sun Jan 27 19:25:26 +0000 2019,1089605318366191616,I m pretty sure I just discovered that Familyg...,I’m pretty sure I just discovered that @Family...,,,,[@FamilyGuyonFOX],"[FamilyGuy, Apple, ApplePencil]"


In [564]:
samsung_clean.head()

Unnamed: 0,created_at,id_str,text,full_text,id_str_retweeted,text_retweeted,full_text_retweeted,Twitter handles,Hashtags
1,Fri Mar 01 03:01:41 +0000 2019,1101316548176695296,Whitestone Dome Glass for Samsung Galaxy S10 1...,,1.1012940960170228e+18,Whitestone Dome Glass for Samsung Galaxy S10/1...,,[@YouTube],[]
2,Fri Mar 01 03:01:43 +0000 2019,1101316556993134593,Samsung Galaxy S10 Plus Durability Test New Fl...,,1.1013029788147384e+18,Samsung Galaxy S10 Plus Durability Test- New F...,,[@YouTube],[]
3,Fri Mar 01 03:01:50 +0000 2019,1101316588353937408,Samsung Galaxy S10 DON ' T MESS IT UP Youtube,,1.1012613908477624e+18,Samsung Galaxy S10 - DON'T MESS IT UP https://...,,[@YouTube],[]
4,Fri Mar 01 03:01:51 +0000 2019,1101316589729841152,Just saw this on Amazon: Space Pop Sockets ......,Just saw this on Amazon: Space PopSockets ... ...,,,,[@amazon],"[iPhone, iPhoneXS, iPhone8, iPhone7Plus, iPhon..."
5,Fri Mar 01 03:01:51 +0000 2019,1101316592040738816,Will the Galaxy S10 Fingerprint Scanner Work W...,,1.1012337151145329e+18,Will the Galaxy S10+ Fingerprint Scanner Work ...,,[@YouTube],[]


In [565]:
#Save as csv
apple_clean.to_csv('.\\Data\\apple_processed.csv',index=False)
samsung_clean.to_csv('.\\Data\\samsung_processed.csv',index=False)