# Compile Twitter Data
Compile raw .json files from twitter feed into an organized file for performing analytics 

In [1]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import re

In [2]:
# Use the smaller sample file while developing
path_to_data = 'nuggets5k.json'

# Handle that the next cell iterates over, and the list that it fills
tweet_file = open(path_to_data, 'r')
tweet_data = []

In [3]:
# Parse the file one line at a time; every line that holds valid JSON becomes
# one entry of tweet_data.
try:
    for line in tweet_file:
        try:
            tweet_data.append(json.loads(line))
        except ValueError:
            # json.loads raises ValueError for lines that are not valid JSON;
            # skip those.  A bare `except:` would also swallow
            # KeyboardInterrupt, making the loop impossible to interrupt.
            continue
finally:
    # The handle was opened in an earlier cell and is not needed afterwards.
    # Re-run the cell that opens the file before re-running this one.
    tweet_file.close()

print(len(tweet_data))

5007


There are 5007 total tweets that were stored in the `nuggets5k.json` file by the `twitter_mine.py` program.

# Extract and organize data

Examine a tweet to see what kind of data are available 

In [4]:
# Show the first parsed tweet to see which fields are available
tweet_data[0]

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Wed Sep 28 18:18:17 +0000 2016',
 u'entities': {u'hashtags': [{u'indices': [43, 54], u'text': u'DirectChat'},
   {u'indices': [85, 95], u'text': u'AdultWork'}],
  u'symbols': [],
  u'urls': [],
  u'user_mentions': [{u'id': 2574814004,
    u'id_str': u'2574814004',
    u'indices': [3, 18],
    u'name': u'PrincessJessika',
    u'screen_name': u'JessikaDommeUK'}]},
 u'favorite_count': 0,
 u'favorited': False,
 u'filter_level': u'low',
 u'geo': None,
 u'id': 781196347613536256,
 u'id_str': u'781196347613536256',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'is_quote_status': False,
 u'lang': u'en',
 u'place': None,
 u'retweet_count': 0,
 u'retweeted': False,
 u'retweeted_status': {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Wed Sep 28 17:32:21 +0000 2016',
  u'entities': {u'hash

To begin, extract the following fields into a data file:
* text (the actual content of the tweet -- for which some NLP may be done)
* lang
* location
* hashtags (keywords, can serve as proxy for the content subject)
* retweeted (boolean -- is this content worth repeating?)
* followers_count (how many people is this content reaching?)
* friends_count (how many accounts is this user following?)
* name (how many unique users posting about these topics?)

In [5]:
def add_column(keys, col_name, tweets=None, frame=None):
    '''
    Extract one value per tweet and store the values as a new column.

    Most of the data that are of interest are at depth one or two in each
    tweet's json object.  For every tweet the value reached by following
    `keys` is collected; None is recorded when a key is missing or when an
    intermediate value is not a dict (e.g. a null field).

    @param keys: sequence of one or two keys that identify the value location
       of the data, outermost key first.
    @param col_name: name of the column to be created
    @param tweets: iterable of tweet dicts to read from; defaults to the
       notebook-level `tweet_data`.
    @param frame: DataFrame that receives the column; defaults to the
       notebook-level `df`.

    @return: void (modifies the DataFrame in-place)
    '''
    # Validate the depth once, up front, instead of once per tweet, so the
    # message is also emitted when there are no tweets at all.
    if len(keys) not in (1, 2):
        print('Function not designed to handle key depth > 2')
        return

    # Fall back to the notebook globals only when the caller gave nothing.
    if tweets is None:
        tweets = tweet_data
    if frame is None:
        frame = df

    data = []
    for tweet in tweets:
        value = tweet
        for key in keys:
            # `in` replaces the Python-2-only dict.has_key(); the isinstance
            # guard keeps a null/non-dict intermediate from raising.
            if isinstance(value, dict) and key in value:
                value = value[key]
            else:
                value = None
                break
        data.append(value)
    frame[col_name] = data

In [6]:
# (key path inside the tweet, name of the resulting column), listed in the
# order in which the columns are created
column_specs = [
    (['text'], 'text'),
    (['user', 'lang'], 'user_lang'),
    (['user', 'location'], 'location'),
    (['entities', 'hashtags'], 'hashtags'),
    (['retweeted'], 'retweet'),
    (['retweet_count'], 'n_retweet'),
    (['user', 'followers_count'], 'n_followers'),
    (['user', 'friends_count'], 'n_friends'),
    (['user', 'name'], 'name'),
    (['coordinates'], 'coords'),
]

# Start from an empty frame and let add_column fill it one column at a time
df = pd.DataFrame()
for key_path, column_name in column_specs:
    add_column(key_path, column_name)

In [7]:
# Examine output: preview the first rows of the assembled DataFrame
df.head()

Unnamed: 0,text,user_lang,location,hashtags,retweet,n_retweet,n_followers,n_friends,name,coords
0,RT @JessikaDommeUK: I am now available for #Di...,en,UK,"[{u'indices': [43, 54], u'text': u'DirectChat'...",False,0,12633,933,⭐️AdultworksFinest⭐️,
1,yNewsbot: Bugtraq: #Cisco #Security #Advisory:...,en,tryingto #^fi~{|__#}%%*,"[{u'indices': [19, 25], u'text': u'Cisco'}, {u...",False,0,2636,1084,❌Siraj Solution❌,
2,RT @mims: Amazon is going to compete directly ...,en,,[],False,0,2624,4916,Leroy Wesley Annon,
3,#Cisco Battles Shadow Broker Exploits. Read mo...,en,"San Mateo, CA","[{u'indices': [0, 6], u'text': u'Cisco'}]",False,0,86,2,Networking News,
4,#Cisco Battles Shadow Broker Exploits. Read mo...,en,"San Mateo, CA","[{u'indices': [0, 6], u'text': u'Cisco'}]",False,0,20,2,Communication News,


Latitude/longitude coordinates are potentially of interest to some applications.  Unfortunately, they exist for < 1% of the data.

In [8]:
# Keep only the entries of the coords column that are not None
coord_column = df['coords']
coords = [entry for entry in coord_column if entry is not None]

n_with_coords = len(coords)
n_total = len(coord_column)

# Summary counts, then each coordinate object followed by a blank line
print('No. data with coords: %s' % n_with_coords)
print('Total samples in data set: %s' % n_total)
print('Fraction with coords: %s' % (float(n_with_coords) / n_total))
for point in coords:
    print('%s \n' % (point,))
    

No. data with coords: 15
Total samples in data set: 5007
Fraction with coords: 0.00299580587178
{u'type': u'Point', u'coordinates': [-121.8863286, 37.3382082]} 

{u'type': u'Point', u'coordinates': [-84.3879824, 33.7489954]} 

{u'type': u'Point', u'coordinates': [7.5949121, 50.3680029]} 

{u'type': u'Point', u'coordinates': [-95.6090009, 29.7801889]} 

{u'type': u'Point', u'coordinates': [-82.21336234, 27.03644926]} 

{u'type': u'Point', u'coordinates': [-98.4936282, 29.4241219]} 

{u'type': u'Point', u'coordinates': [-76.7238, 39.192978]} 

{u'type': u'Point', u'coordinates': [-78.898619, 35.9940329]} 

{u'type': u'Point', u'coordinates': [-91.4894045, 41.6838135]} 

{u'type': u'Point', u'coordinates': [-97.7430608, 30.267153]} 

{u'type': u'Point', u'coordinates': [-1.5359478, 53.7988358]} 

{u'type': u'Point', u'coordinates': [-1.5359478, 53.7988358]} 

{u'type': u'Point', u'coordinates': [-87.6297982, 41.8781136]} 

{u'type': u'Point', u'coordinates': [-77.1772604, 38.9338676]} 

{

# Some Basic Text Processing
In this section, determine which of the keywords are found in each tweet
and keep track of this info in additional columns.

In [9]:
def word_in_text(word, text):
    '''
    Search for word in text (case insensitive), return True if found, else
    return False

    The word is matched literally as a substring.  Characters that are
    special in regular expressions (e.g. the '+' in 'c++' or the '.' in
    '.net') are therefore treated as ordinary characters.

    @param word: string to look for
    @param text: string to search in

    @return: True if word occurs anywhere in text, ignoring case
    '''
    # A plain substring test avoids interpreting `word` as a regex pattern,
    # which could raise or match unintended text.
    return word.lower() in text.lower()

In [10]:
# Test: one hit in matching case, one hit in different case, one miss
ex_text = 'This is a text string.'
print('text: %s' % ex_text)
for probe in ('this', 'TEXT', 'word'):
    print('"%s" in text: %s' % (probe, word_in_text(probe, ex_text)))

text: This is a text string.
"this" in text: True
"TEXT" in text: True
"word" in text: False


In [11]:
# Terms to look for in each tweet's text; one column per term is added to df
# in the next cell.
keywords = ['Cisco', 'CCNA', 'AWS', 'Sharepoint', 'linux', 'powershell',
            'itil', 'sql', 'azure', 'ceh']

In [12]:
# Add one column per keyword telling whether that keyword occurs in the tweet
for word in keywords:
    def keyword_flag(tweet, word=word):
        # Tweets whose text is missing or empty get None instead of a boolean
        if not tweet:
            return None
        return word_in_text(word, tweet)

    df[word] = df['text'].apply(keyword_flag)

df.head()

Unnamed: 0,text,user_lang,location,hashtags,retweet,n_retweet,n_followers,n_friends,name,coords,Cisco,CCNA,AWS,Sharepoint,linux,powershell,itil,sql,azure,ceh
0,RT @JessikaDommeUK: I am now available for #Di...,en,UK,"[{u'indices': [43, 54], u'text': u'DirectChat'...",False,0,12633,933,⭐️AdultworksFinest⭐️,,False,False,False,False,False,False,False,False,False,False
1,yNewsbot: Bugtraq: #Cisco #Security #Advisory:...,en,tryingto #^fi~{|__#}%%*,"[{u'indices': [19, 25], u'text': u'Cisco'}, {u...",False,0,2636,1084,❌Siraj Solution❌,,True,False,False,False,False,False,False,False,False,False
2,RT @mims: Amazon is going to compete directly ...,en,,[],False,0,2624,4916,Leroy Wesley Annon,,False,False,True,False,False,False,False,False,False,False
3,#Cisco Battles Shadow Broker Exploits. Read mo...,en,"San Mateo, CA","[{u'indices': [0, 6], u'text': u'Cisco'}]",False,0,86,2,Networking News,,True,False,False,False,False,False,False,False,False,False
4,#Cisco Battles Shadow Broker Exploits. Read mo...,en,"San Mateo, CA","[{u'indices': [0, 6], u'text': u'Cisco'}]",False,0,20,2,Communication News,,True,False,False,False,False,False,False,False,False,False


Write the DataFrame to .csv for potential further processing.
Note: the encoding defaults to ascii, which raises an error if non-ascii text is found (e.g., international text), hence the utf-8 encoding argument.

In [13]:
# Write the compiled DataFrame to twitter.csv, encoded as utf-8
df.to_csv('twitter.csv', encoding = 'utf-8')

In [14]:
# Alternately pickle as a Python object
# NOTE: only load pickle files you created yourself; unpickling an untrusted
# file can execute arbitrary code.
df.to_pickle('twitter_pickle')

Analyses are conducted in the following notebook: 3_twitter_analysis.ipynb