### Import Data

In [1]:
# paths
ROOT_DIR = "caverlee-2011/social_honeypot_icwsm_2011/"

In [2]:
# read data
import pandas as pd
spam_users = pd.read_csv(ROOT_DIR + "content_polluters.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
spam_tweets = pd.read_csv(ROOT_DIR + "content_polluters_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])
# spam_followings = pd.read_csv(ROOT_DIR + "content_polluters_followings.txt",
#                                        sep='\t',
#                                        names = ['UserID',
#                                                 'SeriesOfNumberOfFollowings'])

ham_users = pd.read_csv(ROOT_DIR + "legitimate_users.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
ham_tweets = pd.read_csv(ROOT_DIR + "legitimate_users_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])

### Feature Extraction

Features:

(User demographics)
1. Length of Screen Name (given)
2. Length of Description (given)
3. Longevity (calculate)

(User Friendship Networks)
1. Number of following (given)
2. Number of followers (given)
3. Ratio of Number of following and followers (calculate)
4. Percentage of Bidirectional Friends (missing)
5. Standard Deviation of Unique numerical IDs of following (missing)
6. standard deviation of unique numerical IDs of followers (missing)

(User Content)
1. the number of posted tweets (given)
2. number of posted tweets per day (calculate)
3. |links|in tweets /|tweets| (calculate)
4. |unique links|in tweets /|tweets| (calculate)
5. |@username|in tweets /|tweets| (calculate)
6. |unique@username|in tweets /|tweets| (calculate)
7. Average Content Similarity over all pairs of tweets posted by a user (missing, don't know similarity metric)
8. ZIP compression ratio of posted tweets (calculate, hard)

(User History)
1. Change rate of number of following (calcualte)

In [3]:
len(ham_users['UserID'].unique())

19276

In [4]:
import numpy as np
# given features
ham_users_input = ham_users[['UserID',
                             'LengthOfScreenName',
                            'LengthOfDescriptionInUserProfile',
                           'NumberOfFollowings',
                           'NumberOfFollowers',
                           'NumberOfTweets']]

In [34]:
# get tweet counts by user
tweet_counts = ham_tweets.groupby('UserID').size().reset_index()
tweet_counts.columns = ['UserID','RecordedTweetCount']
ham_users = pd.merge(ham_users, tweet_counts, on='UserID', how='left')

In [5]:
# add longevity
from datetime import datetime
date_format = '%Y-%m-%d %H:%M:%S'

def get_longevity(start,end):
    start_date = datetime.strptime(start, date_format)
    end_date = datetime.strptime(end, date_format)
    return (end_date - start_date).days

ham_users_input['Longevity'] = ham_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ham_users_input['Longevity'] = ham_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)


In [6]:
# add ratio of followings to followers
ham_users_input['FollowRatio'] = ham_users_input['NumberOfFollowings'] / ham_users_input['NumberOfFollowers']
# replace divide by zero errors with 0
ham_users_input['FollowRatio'] = ham_users_input['FollowRatio'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [7]:
# add number of posted tweets per day
ham_users_input['TweetsPerDay'] = ham_users_input['NumberOfTweets'] / ham_users_input['Longevity']
# replace divide by zero errors with 0
ham_users_input['TweetsPerDay'] = ham_users_input['TweetsPerDay'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [48]:
# add average links per tweet
import re

# function for counting number of links in tweet
def count_links(text):
    return len(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(text)))

# add link count to tweets dataframe
ham_tweets['LinkCount'] = ham_tweets['Tweet'].apply(lambda x: count_links(x))

# get sum of links by user
sum_links = ham_tweets.groupby('UserID')['LinkCount'].sum().reset_index()
sum_links.columns = ['UserID','LinkSum']

# add link average to features
ham_users_input['LinkAverage'] = sum_links['LinkSum'] / ham_users['RecordedTweetCount']

# replace divide by zero errors with 0
ham_users_input['LinkAverage'] = ham_users_input['LinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [49]:
# add average unique links per tweet

# function for finding links in tweet
def get_links(text):
    return set(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(text)))

# add unique links to tweets dataframe
ham_tweets['UniqueLinks'] = ham_tweets['Tweet'].apply(lambda x: get_links(x))

def merge_sets(sets):
    return set.union(*sets)

# get unique links by user
unique_links = ham_tweets.groupby('UserID')['UniqueLinks'].agg(merge_sets).reset_index()
unique_links.columns = ['UserID','UniqueLinks']
unique_links['UniqueLinkSum'] = unique_links['UniqueLinks'].apply(lambda x: len(x))

# add unique link average to features
ham_users_input['UniqueLinkAverage'] = unique_links['UniqueLinkSum'] / ham_users['RecordedTweetCount']

# replace divide by zero errors with 0
ham_users_input['UniqueLinkAverage'] = ham_users_input['UniqueLinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [58]:
# add average mentions per tweet

def count_mentions(text):
    return len(re.findall("@([a-zA-Z0-9]{1,15})", str(text)))

# add username count to tweets dataframe
ham_tweets['MentionCount'] = ham_tweets['Tweet'].apply(lambda x: count_mentions(x))

# get sum of mentions by user
sum_mentions = ham_tweets.groupby('UserID')['MentionCount'].sum().reset_index()
sum_mentions.columns = ['UserID','MentionSum']

# add mention average to features
ham_users_input['MentionAverage'] = sum_mentions['MentionSum'] / ham_users['RecordedTweetCount']

# replace divide by zero errors with 0
ham_users_input['MentionAverage'] = ham_users_input['MentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [59]:
# add average unique mentions per tweet

# function for finding mentions in tweet
def get_mentions(text):
    return set(re.findall("@([a-zA-Z0-9]{1,15})", str(text)))

# add unique mentions to tweets dataframe
ham_tweets['UniqueMentions'] = ham_tweets['Tweet'].apply(lambda x: get_mentions(x))

# get unique mentions by user
unique_mentions = ham_tweets.groupby('UserID')['UniqueMentions'].agg(merge_sets).reset_index()
unique_mentions.columns = ['UserID','UniqueMentions']
unique_mentions['UniqueMentionSum'] = unique_mentions['UniqueMentions'].apply(lambda x: len(x))

# add unique mention average to features
ham_users_input['UniqueMentionAverage'] = unique_mentions['UniqueMentionSum'] / ham_users['RecordedTweetCount']

# replace divide by zero errors with 0
ham_users_input['UniqueMentionAverage'] = ham_users_input['UniqueMentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)


In [60]:
ham_users_input

Unnamed: 0,UserID,CreatedAt,CollectedAt,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,RecordedTweetCount,LinkAverage,UniqueLinkAverage,MentionAverage,UniqueMentionAverage
0,614,2006-07-13 15:30:05,2009-11-20 23:56:21,510,350,3265,10,34,200.0,0.31000,0.015000,0.590000,0.310000
1,1038,2006-07-15 16:12:15,2009-11-16 05:12:11,304,443,4405,7,156,200.0,0.27500,0.100000,0.805000,0.460000
2,1437,2006-07-16 12:29:24,2009-11-16 16:25:12,45,73,725,6,37,200.0,0.09500,0.020000,0.650000,0.205000
3,2615,2006-07-19 23:23:55,2009-11-27 18:34:36,211,230,211,7,0,200.0,0.13500,0.085000,0.380000,0.215000
4,3148,2006-07-26 14:17:22,2009-11-20 17:35:18,7346,7244,11438,8,97,199.0,0.58794,0.035176,0.728643,0.487437
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19271,93390990,2009-11-29 06:34:30,2009-11-29 07:50:54,5,0,5,11,0,5.0,0.00000,0.000000,0.000000,0.000000
19272,93402679,2009-11-29 07:47:50,2009-11-29 07:56:28,20,1,1,12,0,1.0,0.00000,0.000000,0.000000,0.000000
19273,93419256,2009-11-29 09:23:30,2009-11-29 09:30:48,0,0,1,8,0,1.0,0.00000,0.000000,0.000000,0.000000
19274,93426370,2009-11-29 10:04:26,2009-11-29 10:13:17,20,1,1,10,0,1.0,0.00000,0.000000,0.000000,0.000000
