In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from langdetect import detect
import glob
import os

In [2]:
def isEnglish(s):
    try:
        str(s).encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [3]:
def load_data():

    tweets_data_path = 'twtdata/00.json'

    tweets_data = []
    tweets_file = open(tweets_data_path, encoding='utf-8')
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            if isEnglish(tweet['text']):
                tweets_data.append(tweet)
        except:
            continue
            
    return tweets_data

In [4]:
def get_twt_attr(twt, attr_key):
    if twt == None:
        return None
    
    if attr_key in twt:
        return twt[attr_key]
    
    return None

In [5]:
def populate_dataframe(tweets_data):
    columns = ['created_at', 'text', 'user_id', 'user_description', 'user_followers_count', 
               'user_time_zone', 'user_lang', 'timestamp_ms', 'retweet_count', 'favorite_count', 'hashtags']

    tweets = pd.DataFrame(columns=columns)

    for twt in tweets_data:
        twt_data = { 
                    'created_at': get_twt_attr(twt, 'created_at'), 
                    'text': get_twt_attr(twt, 'text'), 
                    'user_id': get_twt_attr(get_twt_attr(twt, 'user'), 'id'), 
                    'user_description': get_twt_attr(get_twt_attr(twt, 'user'), 'description'), 
                    'user_followers_count': get_twt_attr(get_twt_attr(twt, 'user'), 'followers_count'), 
                    'user_time_zone': get_twt_attr(get_twt_attr(twt, 'user'), 'time_zone'), 
                    'user_lang': get_twt_attr(get_twt_attr(twt, 'user'), 'lang'), 
                    'timestamp_ms': get_twt_attr(twt, 'timestamp_ms'), 
                    'retweet_count': get_twt_attr(twt, 'retweet_count'), 
                    'favorite_count': get_twt_attr(twt, 'favorite_count'), 
                    'hashtags': get_twt_attr(get_twt_attr(twt, 'entities'), 'hashtags') 
                    };

        tweets.loc[len(tweets)] = twt_data
        
    
    tweets = tweets[tweets['text'].notna()]
    tweets = tweets[tweets['user_time_zone'].notna()]
    tweets = tweets.loc[tweets.user_time_zone.str.contains('(US & Canada)')]
    tweets = tweets.loc[tweets.user_lang.str.contains('en')]
    tweets['created_at'] = pd.to_datetime(tweets['created_at'])
    tweets['created_at'] = tweets['created_at'].dt.date
    
    tweets = tweets.drop(columns=['user_description', 'user_time_zone', 'user_lang', 'timestamp_ms', 'user_id'])
    
    return tweets
    

In [6]:
path = 'twtdata/'
counter = 1
for filename in glob.glob(os.path.join(path, '*.json')):
    tweets_data = []
    with open(filename, 'r') as f: 
        for tw in f:
            try:
                tweet = json.loads(tw)
                if isEnglish(tweet['text']):
                    tweets_data.append(tweet)
            except:
                continue
            
    tweets = populate_dataframe(tweets_data)
    
    print(str(counter) + '. processing - ' + filename + ' - len:' + str(len(tweets)))
    
    tweets.to_csv('processed_data/master_data.csv', mode='a', header=False)
    
    counter += 1
#tweets.head()



1. processing - twtdata/05.json - len:206
2. processing - twtdata/03.json - len:215
3. processing - twtdata/33.json - len:200
4. processing - twtdata/19.json - len:173
5. processing - twtdata/13.json - len:221
6. processing - twtdata/10.json - len:191
7. processing - twtdata/11.json - len:216
8. processing - twtdata/02.json - len:199
9. processing - twtdata/59.json - len:190
10. processing - twtdata/26.json - len:185
11. processing - twtdata/36.json - len:198
12. processing - twtdata/37.json - len:177
13. processing - twtdata/45.json - len:196
14. processing - twtdata/58.json - len:166
15. processing - twtdata/15.json - len:178
16. processing - twtdata/52.json - len:183
17. processing - twtdata/28.json - len:211
18. processing - twtdata/17.json - len:166
19. processing - twtdata/38.json - len:187
20. processing - twtdata/24.json - len:162
21. processing - twtdata/29.json - len:188
22. processing - twtdata/41.json - len:158
23. processing - twtdata/35.json - len:188
24. processing - twt

In [25]:
tweets.tail()

Unnamed: 0,created_at,text,user_followers_count,retweet_count,favorite_count,hashtags
934,2016-12-29,@Vachie Hey there. Let's take a look into it f...,153073,0,0,[]
945,2016-12-29,@ashbonicole aw have her make me some idc what...,397,0,0,[]
949,2016-12-29,RT @monikapenelopa2: https://t.co/Bf8DWN8UY4,137,0,0,[]
950,2016-12-29,RT @Jordan_Fisher: @_mandygonzalez is a goddes...,376,0,0,[]
951,2016-12-29,RT @pettyyonceh: one of the best rock songs ev...,764,0,0,[]


In [26]:
len(tweets)

240

In [15]:
#tweets.to_csv('processed_data/master_data.csv')

In [16]:
#tweets.drop(tweets[detect(str(tweets.text)) == 'en'].index, inplace=True)

In [27]:
#df = pd.read_json('processed_data/master_data.csv')

file_path = 'processed_data/master_data.csv'
df = pd.read_csv(file_path)
df.head()


Unnamed: 0.1,Unnamed: 0,created_at,text,user_followers_count,retweet_count,favorite_count,hashtags
0,0,2016-12-29,Happy birthday to say you when I see you (here...,36.0,0,0.0,[]
1,7,2016-12-29,"I can smile, and its not. I have people that m...",1376.0,0,0.0,[]
2,11,2016-12-29,luh HAHAHAHAHAHAHA,124.0,0,0.0,[]
3,16,2016-12-29,@Baelizean word?,3628.0,0,0.0,[]
4,17,2016-12-29,GURUS Invoke Lack Of Trust. We Are Here To Pro...,2574.0,0,0.0,"[{'text': 'trust', 'indices': [51, 57]}, {'tex..."


In [28]:
len(df)

47428