#Loading Data to Classification
## Census Data
The most commom names and surnames were extracted from US Census data in order to use in a heuristic to classify tweet actors into the base categories

In [74]:
import pandas as pd

female_first_name = pd.read_csv('data/names/census-dist-female-first.txt', sep='\s+', header=None)
male_first_name = pd.read_csv('data/names/census-dist-male-first.txt', sep='\s+', header=None)
surnames = pd.read_csv('data/names/surnames.txt', sep='\s+', header=None)

concat_names = pd.concat([female_first_name[0], male_first_name[0], surnames[0]], ignore_index=True)
print(concat_names.head())
print(len(concat_names))

0         MARY
1     PATRICIA
2        LINDA
3      BARBARA
4    ELIZABETH
Name: 0, dtype: object
94293


## Raw Tweet Data
The raw tweet data was exported from Kinesis to S3, where all gnip formatted data is available for download

- Transform old s3 files that were separated by spaces and transform into \n separated

In [47]:
import json

from os import listdir
from os.path import isfile, join

space_delimited_files = [ f for f in listdir("data/raw/space_delimited") if isfile(join("data/raw/space_delimited",f)) ]

raw_tweets = None 
for file_name in space_delimited_files:
    with open('data/raw/space_delimited/' + file_name) as o:
        with open('data/raw/' + file_name, "a+") as w
            w.write("\n".join(['{' + l + '}' for l in content[1:-1].split('} {')]))

- Load json parsed files in memory to further generate necessary csv files

In [116]:
import glob

json_files = glob.glob("data/raw/*-*")
tweets = []
for json_file in json_files:
    with open(json_file, encoding="utf8") as json_content:
        print('Parsing', json_file, 'reading json:') 
        json_lines = json_content.read().split("}}\n")
        for l in json_lines:
            try:
                if l:
                    tweets.append(json.loads(l + "}}"))
            except:
                print('Error parsing file', json_file, 'reading json:', l + "}}") 
                print('Parsed', len(json_array), 'already!')
                raise
        print('### Parsed', len(json_array), 'total!')

Parsing data/raw/49551698162644073620788046296223605315150229189597069314-49551698162644073620788046299457481882619403463622131714 reading json:
### Parsed 2675 total!
Parsing data/raw/49551698162644073620788046299458690808439018092796837890-49551698162644073620788046302206579196423111507309494274 reading json:
### Parsed 2675 total!
Parsing data/raw/49551698162644073620788046302207788122242726205203677186-49551698162644073620788046304909737329081463711077498882 reading json:
### Parsed 2675 total!
Parsing data/raw/49551698162666374365986576993647588019643763754950197266-49551698162666374365986576996833107554328352930706489362 reading json:
### Parsed 2675 total!
Parsing data/raw/49551698162666374365986576996834316480147967628600672274-49551698162666374365986576999548354945182851357502078994 reading json:
### Parsed 2675 total!
Parsing data/raw/49551698162666374365986576999549563871002465986676785170-49551698162666374365986577002123366940962052731312275474 reading json:
### Parsed 2675

- Persist necessary attributes in csv files for further loading in pandas dataframe for analisys

In [163]:
import csv

with open('data/csv/json_to_csv_tweets.csv', 'w') as csv_file:
    tweets_writer = csv.writer(csv_file)
    tweets_writer.writerow([
        'actor_id',
        'actor_screen_name',
        'actor_name',
        'actor_verified',
        'actor_friends_count',
        'actor_followers_count',
        'actor_listed_count',
        'actor_statuses_count',
        'actor_favorites_count',
        'actor_summary',
        'actor_created_at',
        'actor_location',
        
        'tweet_id',
        'tweet_created_at',
        'tweet_generator',
        'tweet_body',
        'tweet_verb',
            
        'tweet_urls_count',
        'tweet_mentions_count',
        'tweet_hashtags_count',
        'tweet_trends_count',
        'tweet_symbols_count'])
    for tweet in tweets:
        tweets_writer.writerow([
                tweet['actor']['id'],
                tweet['actor']['preferredUsername'],
                tweet['actor']['displayName'],
                tweet['actor']['verified'],
                tweet['actor']['friendsCount'],
                tweet['actor']['followersCount'],
                tweet['actor']['listedCount'],
                tweet['actor']['statusesCount'],
                tweet['actor']['favoritesCount'],
                tweet['actor']['summary'],
                tweet['actor']['postedTime'],
                tweets[1]['actor']['location']['displayName'] if tweets[1]['actor'].get('location') else 'null',
                
                tweet['id'],
                tweet['postedTime'],
                tweet['generator']['displayName'],
                tweet['body'],
                tweet['verb'],
                
                len(tweet['twitter_entities']['urls']),
                len(tweet['twitter_entities']['user_mentions']),
                len(tweet['twitter_entities']['hashtags']),
                len(tweet['twitter_entities']['trends']),
                len(tweet['twitter_entities']['symbols'])
            ])


#CSV file with actor and tweet information, no aggregation

This data will be used to run a few analisys and to train items into the categories we want to buld the classifier for
- Business
- Person
- Bot

In order to execute the training we will create a few heuristics

In [180]:
df_tweets = pd.read_csv('data/csv/json_to_csv_tweets.csv')
df_tweets.head()

Unnamed: 0,actor_id,actor_screen_name,actor_name,actor_verified,actor_friends_count,actor_followers_count,actor_listed_count,actor_statuses_count,actor_favorites_count,actor_summary,...,tweet_id,tweet_created_at,tweet_generator,tweet_body,tweet_verb,tweet_urls_count,tweet_mentions_count,tweet_hashtags_count,tweet_trends_count,tweet_symbols_count
0,id:twitter.com:474197738,shannon_jaeger,Shannon Jaeger,False,617,445,44,4642,797,"Software developer from websites, to games, to...",...,"tag:search.twitter.com,2005:613058481197715456",2015-06-22T18:58:26.000Z,Twitter for BlackBerry,RT @TechCrunch: 6 ways to build a more diverse...,share,1,1,0,0,0
1,id:twitter.com:2297542010,nurse17194149,nurseな日々,False,235,193,7,43581,0,犬とスイーツ、ランニングが好きです。,...,"tag:search.twitter.com,2005:613058480489017344",2015-06-22T18:58:26.000Z,IFTTT,US Open champion Jordan Spieth eyes grand slam...,post,1,0,0,0,0
2,id:twitter.com:2513460271,All_Jobs_,All Jobs,False,928,708,73,202676,0,Tweets with latest #Jobs #Offers !!,...,"tag:search.twitter.com,2005:613058482833469440",2015-06-22T18:58:27.000Z,twitterfeed,#Cleveland #Job Traveling Assistant - PrimeSou...,post,1,0,3,0,0
3,id:twitter.com:468841295,Oscar_Gomez58,Oscar,False,285,547,2,93862,36816,Gods #1 Donna Neri is my one true love ❤️ IG:O...,...,"tag:search.twitter.com,2005:613058485551415296",2015-06-22T18:58:27.000Z,Twitter for iPhone,RT @NFLRT: Cam Newton doesn't have time for th...,share,1,1,0,0,0
4,id:twitter.com:1720650906,mukndv,Mukund,False,317,311,0,1111,3901,Prospective linebacker for alabama\nweight lif...,...,"tag:search.twitter.com,2005:613058483576045568",2015-06-22T18:58:27.000Z,Twitter for iPhone,RT @verge: The US Army is getting hoverbikes h...,share,1,1,0,0,0


## Determining Devices for Users

We believe that most posts made directly in twitter web and using phone clients should be tweets posted by persons and not businesses

In [183]:
device = df_tweets[df_tweets['tweet_verb'] == 'post'][['tweet_generator', 'tweet_id']]

In [193]:
posts_by_device = device.groupby('tweet_generator').count()
posts_by_device['percentage'] = (posts_by_device.tweet_id / posts_by_device.tweet_id.sum()) * 100
posts_by_device = posts_by_device[['percentage']].sort('percentage', ascending=False)

print(posts_by_device.head(10))

                     percentage
tweet_generator                
Twitter Web Client    34.103720
IFTTT                 11.992672
Twitter for iPhone     9.949267
Hootsuite              5.876550
dlvr.it                5.284667
twitterfeed            4.241826
Twitter for Android    3.523112
TweetDeck              2.621195
Buffer                 2.170237
Facebook               1.338782


In [195]:
client_devices = device[device.tweet_generator.str.startswith('Twitter ')].groupby('tweet_generator').count()
client_devices['percentage'] = (client_devices.tweet_id / client_devices.tweet_id.sum()) * 100
client_devices = client_devices[['percentage']].sort('percentage', ascending=False)

print(client_devices)

                             percentage
tweet_generator                        
Twitter Web Client            69.640288
Twitter for iPhone            20.316547
Twitter for Android            7.194245
Twitter for iPad               1.410072
Twitter for Mac                0.489209
Twitter for BlackBerry         0.230216
Twitter for Windows Phone      0.230216
Twitter Ads                    0.143885
Twitter for BlackBerry®        0.143885
Twitter for Android Tablets    0.086331
Twitter for  Android           0.057554
Twitter for Windows            0.057554


In [205]:
all_filtered_devices = device[device.tweet_generator.str.startswith('Twitter ')].tweet_generator.unique()
non_client_devices = ['Twitter Ads']
client_devices = [item for item in all_filtered_devices if item not in non_client_devices]
print(client_devices)

['Twitter Web Client', 'Twitter for iPhone', 'Twitter for Android', 'Twitter for iPad', 'Twitter for  Android', 'Twitter for Windows', 'Twitter for Mac', 'Twitter for Android Tablets', 'Twitter for BlackBerry®', 'Twitter for Windows Phone', 'Twitter for BlackBerry']


In [221]:
filtered_by_devices_tweets = df_tweets[df_tweets.tweet_generator.isin(client_devices)][df_tweets.tweet_verb == 'post']
print(len(filtered_by_devices_tweets))
filtered_by_devices_tweets[df_tweets.tweet_verb == 'post'][['actor_screen_name', 'actor_summary', 'tweet_body', 'tweet_generator']].head()

3470




Unnamed: 0,actor_screen_name,actor_summary,tweet_body,tweet_generator
7,GunJoyWebsite,http://GunJoy.com is a website dedicated to th...,$74.99 - UA Reliance Tactical Sunglasses | Und...,Twitter Web Client
11,cafoxorixaty,,FREE #porn and #bi #sex on #pussy http://t.co/...,Twitter Web Client
22,royjulie4,Be careful who you call your friends. I'd rath...,@saira36863892 Need new kicks? Got old kicks t...,Twitter Web Client
23,djpeefunk,"Giving you the latest Hip-Hop, R&B, and Gospel...",#Nowplaying #HipHop #RnB #ALLinCLE Tune in htt...,Twitter Web Client
29,nilcat_jj,Don Mathieuinho JR,@nba2kcodefishy @TremoniaRules @Ronnie2K do u ...,Twitter for iPhone
