#Loading Data to Classification
## Census Data
The most commom names and surnames were extracted from US Census data in order to use in a heuristic to classify tweet actors into the base categories

In [1]:
import pandas as pd

female_first_name = pd.read_csv('data/names/census-dist-female-first.txt', sep='\s+', header=None)
male_first_name = pd.read_csv('data/names/census-dist-male-first.txt', sep='\s+', header=None)
surnames = pd.read_csv('data/names/surnames.txt', sep='\s+', header=None)

us_names = pd.concat([female_first_name[0], male_first_name[0], surnames[0]], ignore_index=True)
us_names = set(us_names.str.lower().tolist())
print(len(us_names))

91909


## Raw Tweet Data
The raw tweet data was exported from Kinesis to S3, where all gnip formatted data is available for download

- Transform old s3 files that were separated by spaces and transform into \n separated

In [2]:
#import json

#from os import listdir
#from os.path import isfile, join

#space_delimited_files = [ f for f in listdir("data/raw/space_delimited") if isfile(join("data/raw/space_delimited",f)) ]

#raw_tweets = None 
#for file_name in space_delimited_files:
#    with open('data/raw/space_delimited/' + file_name) as o:
#        with open('data/raw/' + file_name, "a+") as w
#            w.write("\n".join(['{' + l + '}' for l in content[1:-1].split('} {')]))

- Load json parsed files in memory to further generate necessary csv files

In [4]:
import glob
import json

json_files = glob.glob("data/raw/*-*")
tweets = []
for json_file in json_files:
    with open(json_file) as json_content:
        json_lines = json_content.read().split("}}\n")
        for l in json_lines:
            try:
                if l:
                    tweets.append(json.loads(l + "}}"))
            except:
                print('Error parsing file', json_file, 'reading json:', l + "}}") 
                raise
        print('# Parsed', len(json_lines), 'lines from', json_file)
        print('### Parsed', len(tweets), 'in total')

('# Parsed', 2024, 'lines from', 'data/raw/49551698162644073620788046595537922216082340233330294786-49551698162644073620788046597982370223343161793711177730')
('### Parsed', 2023, 'in total')
('# Parsed', 2333, 'lines from', 'data/raw/49551698162644073620788046597983579149162776422885883906-49551698162644073620788046600801585234684518260812021762')
('### Parsed', 4355, 'in total')
('# Parsed', 2117, 'lines from', 'data/raw/49551698162644073620788046600802794160504133027425681410-49551698162644073620788046603359672268989114894895808514')
('### Parsed', 6471, 'in total')
('# Parsed', 2388, 'lines from', 'data/raw/49551698162644073620788046603360881194808729661509468162-49551698162644073620788046606246587126228890733219151874')
('### Parsed', 8858, 'in total')
('# Parsed', 2143, 'lines from', 'data/raw/49551698162644073620788046606247796052048505362393858050-49551698162644073620788046608836106231843467725845299202')
('### Parsed', 11000, 'in total')
('# Parsed', 2244, 'lines from', 'data/

- Persist necessary attributes in csv files for further loading in pandas dataframe for analisys

In [8]:
import csv

with open('data/csv/extract_untrained_tweets.csv', 'w') as csv_file:
    tweets_writer = csv.writer(csv_file)
    tweets_writer.writerow([
        'actor_id',
        'actor_screen_name',
        'actor_name',
        'actor_verified',
        'actor_friends_count',
        'actor_followers_count',
        'actor_listed_count',
        'actor_statuses_count',
        'actor_favorites_count',
        'actor_summary',
        'actor_created_at',
        'actor_location',
        
        'tweet_id',
        'tweet_created_at',
        'tweet_generator',
        'tweet_body',
        'tweet_verb',
            
        'tweet_urls_count',
        'tweet_mentions_count',
        'tweet_hashtags_count',
        'tweet_trends_count',
        'tweet_symbols_count'])
    for tweet in tweets:
        tweets_writer.writerow([
                tweet['actor']['id'],
                tweet['actor']['preferredUsername'].encode('utf-8'),
                tweet['actor']['displayName'].encode('utf-8'),
                tweet['actor']['verified'],
                tweet['actor']['friendsCount'],
                tweet['actor']['followersCount'],
                tweet['actor']['listedCount'],
                tweet['actor']['statusesCount'],
                tweet['actor']['favoritesCount'],
                tweet['actor']['summary'].encode('utf-8') if tweet['actor']['summary'] else None,
                tweet['actor']['postedTime'],
                tweets[1]['actor']['location']['displayName'] if tweets[1]['actor'].get('location') else 'null',
                
                tweet['id'],
                tweet['postedTime'],
                tweet['generator']['displayName'].encode('utf-8') if tweet['generator']['displayName'] else None,
                tweet['body'].encode('utf-8'),
                tweet['verb'],
                
                len(tweet['twitter_entities']['urls']),
                len(tweet['twitter_entities']['user_mentions']),
                len(tweet['twitter_entities']['hashtags']),
                len(tweet['twitter_entities']['trends']),
                len(tweet['twitter_entities']['symbols'])
            ])


#CSV file with actor and tweet information, no aggregation

This data will be used to run a few analisys and to train items into the categories we want to buld the classifier for
- Business
- Person
- Bot

In order to execute the training we will create a few heuristics

In [9]:
df_tweets = pd.read_csv('data/csv/extract_untrained_tweets.csv')
df_tweets = df_tweets.dropna(subset=['actor_summary', 'tweet_generator'])
print(len(df_tweets))
df_tweets.head()

36873


Unnamed: 0,actor_id,actor_screen_name,actor_name,actor_verified,actor_friends_count,actor_followers_count,actor_listed_count,actor_statuses_count,actor_favorites_count,actor_summary,...,tweet_id,tweet_created_at,tweet_generator,tweet_body,tweet_verb,tweet_urls_count,tweet_mentions_count,tweet_hashtags_count,tweet_trends_count,tweet_symbols_count
0,id:twitter.com:68489753,LMCTVInfo,LMCTV,False,89,275,7,4079,228,LMC-TV is the Public Access TV station broadca...,...,"tag:search.twitter.com,2005:613358628783935492",2015-06-23T14:51:07.000Z,Twitter Web Client,RT @WIRED: Pixar's best director is also its m...,share,1,1,0,0,0
1,id:twitter.com:573951685,cannibalempath,will #savehannibal,False,359,551,18,25564,13934,20 | INFJ | male | hannigram is my lifeblood,...,"tag:search.twitter.com,2005:613358627370369024",2015-06-23T14:51:07.000Z,Twitter for iPhone,"RT @TheRAFanPage: Why not cancel The Bachelor,...",share,0,3,2,0,0
2,id:twitter.com:1044815893,rizkii_tw,Rizki Tri Wahyuni,False,116,1255,2,22447,74,Berusaha ~ Yakin & Percaya ~ Berdoa kepada yan...,...,"tag:search.twitter.com,2005:613358632957145088",2015-06-23T14:51:08.000Z,Twitter for Android,Mau gadget gratis diRamadhan thun ini? FOLLOW ...,post,0,1,1,0,0
3,id:twitter.com:2917732032,MaxSpolaor,Max Spolaor,False,36,14,7,45,3,"coder, astrophysicist, algorithm aficionado",...,"tag:search.twitter.com,2005:613358634588737536",2015-06-23T14:51:08.000Z,Twitter Web Client,RT @WIRED: This radio surveillance bug can ste...,share,1,1,0,0,0
4,id:twitter.com:2976485969,DashDashNote,Christopher Perry,False,454,171,36,6354,72,"A professional Consultative Sales Manager, foo...",...,"tag:search.twitter.com,2005:613358635108933632",2015-06-23T14:51:08.000Z,Twitter for BlackBerry,RT @TechCrunch: How Clifford Stoll Sells Klein...,share,1,2,0,0,0


## Determining Devices for Users

We believe that most posts made directly in twitter web and using phone clients should be tweets posted by persons and not businesses

In [10]:
device = df_tweets[['tweet_generator', 'tweet_id']]
posts_by_device = device.groupby('tweet_generator').count()
posts_by_device['percentage'] = (posts_by_device.tweet_id / posts_by_device.tweet_id.sum()) * 100
posts_by_device = posts_by_device[['percentage']].sort('percentage', ascending=False)

print(posts_by_device.head(10))

                     percentage
tweet_generator                
Twitter for iPhone    27.939142
Twitter Web Client    16.453774
Twitter for Android   11.523337
IFTTT                  9.755105
twitterfeed            4.119545
dlvr.it                3.682912
Hootsuite              3.468663
TweetDeck              3.124237
Twitter for iPad       1.757383
RoundTeam              1.697719


In [11]:
client_devices = device[device.tweet_generator.str.startswith('Twitter ')].groupby('tweet_generator').count()
client_devices['percentage'] = (client_devices.tweet_id / client_devices.tweet_id.sum()) * 100
client_devices = client_devices[['percentage']].sort('percentage', ascending=False)

print(client_devices)

                             percentage
tweet_generator                        
Twitter for iPhone            47.313309
Twitter Web Client            27.863507
Twitter for Android           19.514099
Twitter for iPad               2.976026
Twitter for Windows Phone      0.785340
Twitter for Mac                0.486819
Twitter for BlackBerry         0.376596
Twitter for Android Tablets    0.215854
Twitter for BlackBerry®        0.211261
Twitter Ads                    0.165335
Twitter for Windows            0.068890
Twitter for  Android           0.013778
Twitter for Apple Watch        0.009185


In [12]:
all_filtered_devices = device[device.tweet_generator.str.startswith('Twitter ')].tweet_generator.unique()
non_client_devices = ['Twitter Ads']
client_devices = [item for item in all_filtered_devices if item not in non_client_devices]
print(client_devices)

['Twitter Web Client', 'Twitter for iPhone', 'Twitter for Android', 'Twitter for BlackBerry', 'Twitter for Windows Phone', 'Twitter for iPad', 'Twitter for BlackBerry\xc2\xae', 'Twitter for Mac', 'Twitter for Android Tablets', 'Twitter for Windows', 'Twitter for Apple Watch', 'Twitter for  Android']


In [13]:
df_tweets['score'] = df_tweets.tweet_generator.map(lambda v: 0.33 if v in client_devices else 0)
filtered_by_devices_tweets = df_tweets[df_tweets.tweet_generator.isin(client_devices)]
print('Tweets filtered by posting device:', len(filtered_by_devices_tweets))
filtered_by_devices_tweets[['actor_screen_name', 'actor_summary', 'tweet_body', 'tweet_generator']].head()

('Tweets filtered by posting device:', 21738)


Unnamed: 0,actor_screen_name,actor_summary,tweet_body,tweet_generator
0,LMCTVInfo,LMC-TV is the Public Access TV station broadca...,RT @WIRED: Pixar's best director is also its m...,Twitter Web Client
1,cannibalempath,20 | INFJ | male | hannigram is my lifeblood,"RT @TheRAFanPage: Why not cancel The Bachelor,...",Twitter for iPhone
2,rizkii_tw,Berusaha ~ Yakin & Percaya ~ Berdoa kepada yan...,Mau gadget gratis diRamadhan thun ini? FOLLOW ...,Twitter for Android
3,MaxSpolaor,"coder, astrophysicist, algorithm aficionado",RT @WIRED: This radio surveillance bug can ste...,Twitter Web Client
4,DashDashNote,"A professional Consultative Sales Manager, foo...",RT @TechCrunch: How Clifford Stoll Sells Klein...,Twitter for BlackBerry


#Determine humans by ocurrence of Name and Summary Personal Description
This heuristic will check for accounts which the name of the actor is fully represented by american first names and surnames, together with a description containing "I'm" or "I am"

In [14]:
df_tweets['split_actor_name'] = df_tweets.actor_name.str.lower().str.split(' ')
df_tweets = df_tweets.dropna()

In [15]:
import re

pattern = "i'|i am|ceo|cto|cfo|cmo|editor|founder|director|programmer|curator|designer|entrepreneur|marketer|assistant|lead|experienced|developer|executive|writer|reader|speaker|adventurer|my"

print('Total Tweets:', len(df_tweets))
df_tweets['score'] += df_tweets.actor_summary.map(lambda s: 0.33 if re.match(pattern, s.lower()) else 0)
summary_filtered_tweet = df_tweets[df_tweets.actor_summary.str.lower().str.contains(pattern)]
print('Tweets filtered by summary:', len(summary_filtered_tweet))
df_tweets['score'] += df_tweets.split_actor_name.map(lambda an: 0.33 if set(an).issubset(us_names) else 0)
name_filtered_tweet = df_tweets[df_tweets.split_actor_name.apply(lambda x: set(x).issubset(us_names))]
print('Tweets filtered by name:', len(name_filtered_tweet))
summary_name_filtered_tweet = df_tweets[df_tweets.actor_summary.str.lower().str.contains(pattern)][df_tweets.split_actor_name.apply(lambda x: set(x).issubset(us_names))]
print('Tweets filtered by name and summary:', len(summary_name_filtered_tweet))

('Total Tweets:', 36873)
('Tweets filtered by summary:', 8574)
('Tweets filtered by name:', 10320)
('Tweets filtered by name and summary:', 3137)




In [16]:
print(len(df_tweets[df_tweets['score'] > 0.65]))
df_tweets[df_tweets['score'] > 0.65][['actor_screen_name', 'actor_name', 'actor_summary']].head(1000)

8482


Unnamed: 0,actor_screen_name,actor_name,actor_summary
4,DashDashNote,Christopher Perry,"A professional Consultative Sales Manager, foo..."
11,McCannSportsLaw,Michael McCann,Sports Illustrated Legal Analyst. Professor an...
24,KarenSkeen,Karen Skeen,Digital Edge Business Transformation. Mobile-F...
25,Siete_Uno,Josue,I know I was born and I know that I'll die. Th...
37,andresdavid,Andres Almeida,Communications guy working at @NASA. Makes bad...
40,dfindles,Deborah Findling,"Coffee addicted @SyracuseU Alum, technophile +..."
44,machadoroberta,Roberta Machado,"Entusiasta de quadrinhos, amante de animação e..."
54,conti_dany,Daniela Conti,Specializzata in lavori incomprensibili ai più...
57,imac_48,Ian Mackie,"i believe in happiness, smiles, family, and fr..."
63,Cody_Wayne20,Cody Alphin,Family is Everything! #TarHeels #Panthers #Hor...


##Saving heuristically trainned 'person' classified data

The data persisted in the 'heuristic_trained.csv' is not manually evaluated, we have to check the precision of the heuristic process

In [17]:
df_tweets[df_tweets['score'] > 0.65].to_csv('data/csv/person_heuristic_trained.csv')