#Loading Data to Classification
## Census Data
The most commom names and surnames were extracted from US Census data in order to use in a heuristic to classify tweet actors into the base categories

In [2]:
import pandas as pd

female_first_name = pd.read_csv('data/names/census-dist-female-first.txt', sep='\s+', header=None)
male_first_name = pd.read_csv('data/names/census-dist-male-first.txt', sep='\s+', header=None)
surnames = pd.read_csv('data/names/surnames.txt', sep='\s+', header=None)

us_names = pd.concat([female_first_name[0], male_first_name[0], surnames[0]], ignore_index=True)
us_names = set(us_names.str.lower().tolist())
print(len(us_names))

91909


## Raw Tweet Data
The raw tweet data was exported from Kinesis to S3, where all gnip formatted data is available for download

- Transform old s3 files that were separated by spaces and transform into \n separated

In [3]:
#import json

#from os import listdir
#from os.path import isfile, join

#space_delimited_files = [ f for f in listdir("data/raw/space_delimited") if isfile(join("data/raw/space_delimited",f)) ]

#raw_tweets = None 
#for file_name in space_delimited_files:
#    with open('data/raw/space_delimited/' + file_name) as o:
#        with open('data/raw/' + file_name, "a+") as w
#            w.write("\n".join(['{' + l + '}' for l in content[1:-1].split('} {')]))

- Load json parsed files in memory to further generate necessary csv files

In [11]:
import glob
import json

json_files = glob.glob("data/raw/*-*")
tweets = []
for json_file in json_files:
    with open(json_file, encoding="utf8") as json_content:
        json_lines = json_content.read().split("}}\n")
        for l in json_lines:
            try:
                if l:
                    tweets.append(json.loads(l + "}}"))
            except:
                print('Error parsing file', json_file, 'reading json:', l + "}}") 
                raise
        print('# Parsed', len(json_lines), 'lines from', json_file)
        print('### Parsed', len(tweets), 'in total')

# Parsed 2676 lines from data/raw/49551698162644073620788046296223605315150229189597069314-49551698162644073620788046299457481882619403463622131714
### Parsed 2675 in total
# Parsed 2275 lines from data/raw/49551698162644073620788046299458690808439018092796837890-49551698162644073620788046302206579196423111507309494274
### Parsed 4949 in total
# Parsed 2237 lines from data/raw/49551698162644073620788046302207788122242726205203677186-49551698162644073620788046304909737329081463711077498882
### Parsed 7185 in total
# Parsed 2024 lines from data/raw/49551698162644073620788046595537922216082340233330294786-49551698162644073620788046597982370223343161793711177730
### Parsed 9208 in total
# Parsed 2333 lines from data/raw/49551698162644073620788046597983579149162776422885883906-49551698162644073620788046600801585234684518260812021762
### Parsed 11540 in total
# Parsed 2117 lines from data/raw/49551698162644073620788046600802794160504133027425681410-4955169816264407362078804660335967226898911

- Persist necessary attributes in csv files for further loading in pandas dataframe for analisys

In [12]:
import csv

with open('data/csv/untrained_tweets.csv', 'w') as csv_file:
    tweets_writer = csv.writer(csv_file)
    tweets_writer.writerow([
        'actor_id',
        'actor_screen_name',
        'actor_name',
        'actor_verified',
        'actor_friends_count',
        'actor_followers_count',
        'actor_listed_count',
        'actor_statuses_count',
        'actor_favorites_count',
        'actor_summary',
        'actor_created_at',
        'actor_location',
        
        'tweet_id',
        'tweet_created_at',
        'tweet_generator',
        'tweet_body',
        'tweet_verb',
            
        'tweet_urls_count',
        'tweet_mentions_count',
        'tweet_hashtags_count',
        'tweet_trends_count',
        'tweet_symbols_count'])
    for tweet in tweets:
        tweets_writer.writerow([
                tweet['actor']['id'],
                tweet['actor']['preferredUsername'],
                tweet['actor']['displayName'],
                tweet['actor']['verified'],
                tweet['actor']['friendsCount'],
                tweet['actor']['followersCount'],
                tweet['actor']['listedCount'],
                tweet['actor']['statusesCount'],
                tweet['actor']['favoritesCount'],
                tweet['actor']['summary'],
                tweet['actor']['postedTime'],
                tweets[1]['actor']['location']['displayName'] if tweets[1]['actor'].get('location') else 'null',
                
                tweet['id'],
                tweet['postedTime'],
                tweet['generator']['displayName'],
                tweet['body'],
                tweet['verb'],
                
                len(tweet['twitter_entities']['urls']),
                len(tweet['twitter_entities']['user_mentions']),
                len(tweet['twitter_entities']['hashtags']),
                len(tweet['twitter_entities']['trends']),
                len(tweet['twitter_entities']['symbols'])
            ])


#CSV file with actor and tweet information, no aggregation

This data will be used to run a few analisys and to train items into the categories we want to buld the classifier for
- Business
- Person
- Bot

In order to execute the training we will create a few heuristics

In [13]:
df_tweets = pd.read_csv('data/csv/untrained_tweets.csv')
df_tweets = df_tweets.dropna(subset=['actor_summary', 'tweet_generator'])
print(len(df_tweets))
df_tweets.head()

47471


Unnamed: 0,actor_id,actor_screen_name,actor_name,actor_verified,actor_friends_count,actor_followers_count,actor_listed_count,actor_statuses_count,actor_favorites_count,actor_summary,...,tweet_id,tweet_created_at,tweet_generator,tweet_body,tweet_verb,tweet_urls_count,tweet_mentions_count,tweet_hashtags_count,tweet_trends_count,tweet_symbols_count
0,id:twitter.com:474197738,shannon_jaeger,Shannon Jaeger,False,617,445,44,4642,797,"Software developer from websites, to games, to...",...,"tag:search.twitter.com,2005:613058481197715456",2015-06-22T18:58:26.000Z,Twitter for BlackBerry,RT @TechCrunch: 6 ways to build a more diverse...,share,1,1,0,0,0
1,id:twitter.com:2297542010,nurse17194149,nurseな日々,False,235,193,7,43581,0,犬とスイーツ、ランニングが好きです。,...,"tag:search.twitter.com,2005:613058480489017344",2015-06-22T18:58:26.000Z,IFTTT,US Open champion Jordan Spieth eyes grand slam...,post,1,0,0,0,0
2,id:twitter.com:2513460271,All_Jobs_,All Jobs,False,928,708,73,202676,0,Tweets with latest #Jobs #Offers !!,...,"tag:search.twitter.com,2005:613058482833469440",2015-06-22T18:58:27.000Z,twitterfeed,#Cleveland #Job Traveling Assistant - PrimeSou...,post,1,0,3,0,0
3,id:twitter.com:468841295,Oscar_Gomez58,Oscar,False,285,547,2,93862,36816,Gods #1 Donna Neri is my one true love ❤️ IG:O...,...,"tag:search.twitter.com,2005:613058485551415296",2015-06-22T18:58:27.000Z,Twitter for iPhone,RT @NFLRT: Cam Newton doesn't have time for th...,share,1,1,0,0,0
4,id:twitter.com:1720650906,mukndv,Mukund,False,317,311,0,1111,3901,Prospective linebacker for alabama\nweight lif...,...,"tag:search.twitter.com,2005:613058483576045568",2015-06-22T18:58:27.000Z,Twitter for iPhone,RT @verge: The US Army is getting hoverbikes h...,share,1,1,0,0,0


## Determining Devices for Users

We believe that most posts made directly in twitter web and using phone clients should be tweets posted by persons and not businesses

In [14]:
device = df_tweets[['tweet_generator', 'tweet_id']]

In [15]:
posts_by_device = device.groupby('tweet_generator').count()
posts_by_device['percentage'] = (posts_by_device.tweet_id / posts_by_device.tweet_id.sum()) * 100
posts_by_device = posts_by_device[['percentage']].sort('percentage', ascending=False)

print(posts_by_device.head(10))

                     percentage
tweet_generator                
Twitter for iPhone    29.782393
Twitter Web Client    15.754882
Twitter for Android   11.760865
IFTTT                  8.917023
twitterfeed            3.728592
dlvr.it                3.635904
Hootsuite              3.564281
TweetDeck              3.016579
Twitter for iPad       1.824272
RoundTeam              1.662067


In [16]:
client_devices = device[device.tweet_generator.str.startswith('Twitter ')].groupby('tweet_generator').count()
client_devices['percentage'] = (client_devices.tweet_id / client_devices.tweet_id.sum()) * 100
client_devices = client_devices[['percentage']].sort('percentage', ascending=False)

print(client_devices)

                             percentage
tweet_generator                        
Twitter for iPhone            49.191051
Twitter Web Client            26.022059
Twitter for Android           19.425211
Twitter for iPad               3.013117
Twitter for Windows Phone      0.782854
Twitter for Mac                0.476671
Twitter for BlackBerry         0.396646
Twitter for BlackBerry®        0.236596
Twitter for Android Tablets    0.215720
Twitter Ads                    0.142653
Twitter for Windows            0.069587
Twitter for  Android           0.020876
Twitter for Apple Watch        0.006959


In [17]:
all_filtered_devices = device[device.tweet_generator.str.startswith('Twitter ')].tweet_generator.unique()
non_client_devices = ['Twitter Ads']
client_devices = [item for item in all_filtered_devices if item not in non_client_devices]
print(client_devices)

['Twitter for BlackBerry', 'Twitter for iPhone', 'Twitter Web Client', 'Twitter for Android', 'Twitter for iPad', 'Twitter for Android Tablets', 'Twitter for  Android', 'Twitter for Windows', 'Twitter for Windows Phone', 'Twitter for Mac', 'Twitter for BlackBerry®', 'Twitter for Apple Watch']


In [18]:
df_tweets['score'] = df_tweets.tweet_generator.map(lambda v: 0.33 if v in client_devices else 0)
filtered_by_devices_tweets = df_tweets[df_tweets.tweet_generator.isin(client_devices)]
print('Tweets filtered by posting device:', len(filtered_by_devices_tweets))
filtered_by_devices_tweets[['actor_screen_name', 'actor_summary', 'tweet_body', 'tweet_generator']].head()

Tweets filtered by posting device: 28700


Unnamed: 0,actor_screen_name,actor_summary,tweet_body,tweet_generator
0,shannon_jaeger,"Software developer from websites, to games, to...",RT @TechCrunch: 6 ways to build a more diverse...,Twitter for BlackBerry
3,Oscar_Gomez58,Gods #1 Donna Neri is my one true love ❤️ IG:O...,RT @NFLRT: Cam Newton doesn't have time for th...,Twitter for iPhone
4,mukndv,Prospective linebacker for alabama\nweight lif...,RT @verge: The US Army is getting hoverbikes h...,Twitter for iPhone
7,GunJoyWebsite,http://GunJoy.com is a website dedicated to th...,$74.99 - UA Reliance Tactical Sunglasses | Und...,Twitter Web Client
8,CptnRivers98,Happy AF // Senior // Future RMD Midfielder //...,RT @SneakerNews: Under Armour making moves. Th...,Twitter for iPhone


#Determine humans by ocurrence of Name and Summary Personal Description
This heuristic will check for accounts which the name of the actor is fully represented by american first names and surnames, together with a description containing "I'm" or "I am"

In [19]:
df_tweets['split_actor_name'] = df_tweets.actor_name.str.lower().str.split(' ')
df_tweets = df_tweets.dropna()

In [20]:
import re

pattern = "i'|i am|ceo|cto|cfo|cmo|editor|founder|director|programmer|curator|designer|entrepreneur|marketer|assistant|lead|experienced|developer|executive|writer|reader|my"

print('Total Tweets:', len(df_tweets))
df_tweets['score'] += df_tweets.actor_summary.map(lambda s: 0.33 if re.match(pattern, s.lower()) else 0)
summary_filtered_tweet = df_tweets[df_tweets.actor_summary.str.lower().str.contains(pattern)]
print('Tweets filtered by summary:', len(summary_filtered_tweet))
df_tweets['score'] += df_tweets.split_actor_name.map(lambda an: 0.33 if set(an).issubset(us_names) else 0)
name_filtered_tweet = df_tweets[df_tweets.split_actor_name.apply(lambda x: set(x).issubset(us_names))]
print('Tweets filtered by name:', len(name_filtered_tweet))
summary_name_filtered_tweet = df_tweets[df_tweets.actor_summary.str.lower().str.contains(pattern)][df_tweets.split_actor_name.apply(lambda x: set(x).issubset(us_names))]
print('Tweets filtered by name and summary:', len(summary_name_filtered_tweet))

Total Tweets: 47471
Tweets filtered by summary: 10927
Tweets filtered by name: 13527
Tweets filtered by name and summary: 4015




In [21]:
print(len(df_tweets[df_tweets['score'] > 0.65]))
df_tweets[df_tweets['score'] > 0.65][['actor_screen_name', 'actor_name', 'actor_summary']].head(1000)

11268


Unnamed: 0,actor_screen_name,actor_name,actor_summary
0,shannon_jaeger,Shannon Jaeger,"Software developer from websites, to games, to..."
3,Oscar_Gomez58,Oscar,Gods #1 Donna Neri is my one true love ❤️ IG:O...
7,GunJoyWebsite,Gun Joy,http://GunJoy.com is a website dedicated to th...
13,ttopjr,Tuff,PHS Football #44\nPHS Baseball #16
15,shadowwww_,M.Robinson,"My grind came from seeing my mama struggle , L..."
21,pfnickel,Peter Nickel,"Insurance and Technology, beer leaguer, bbqer,..."
22,royjulie4,Tina Morin,Be careful who you call your friends. I'd rath...
33,That_mexican_BV,Brandon Villarreal,someone who can't sacrifice anything can never...
34,allentharp,Allen Tharp,"As the CEO and founder, I am currently involve..."
41,clerkinj,John Clerkin,"Designer for Qkr! at MasterCard Labs, Craft be..."


##Saving heuristically trainned 'person' classified data

The data persisted in the 'heuristic_trained.csv' is not manually evaluated, we have to check the precision of the heuristic process

In [23]:
df_tweets.to_csv('data/csv/heuristic_trained_person.csv')