### Import Data

In [1]:
# paths
ROOT_DIR = "caverlee-2011/social_honeypot_icwsm_2011/"

In [14]:
# read data
import pandas as pd
spam_users = pd.read_csv(ROOT_DIR + "content_polluters.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
spam_tweets = pd.read_csv(ROOT_DIR + "content_polluters_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])
# spam_followings = pd.read_csv(ROOT_DIR + "content_polluters_followings.txt",
#                                        sep='\t',
#                                        names = ['UserID',
#                                                 'SeriesOfNumberOfFollowings'])

ham_users = pd.read_csv(ROOT_DIR + "legitimate_users.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
ham_tweets = pd.read_csv(ROOT_DIR + "legitimate_users_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])

### Feature Extraction

Features:

(User demographics)
1. Length of Screen Name (given)
2. Length of Description (given)
3. Longevity (calculate)

(User Friendship Networks)
1. Number of following (given)
2. Number of followers (given)
3. Ratio of Number of following and followers (calculate)
4. Percentage of Bidirectional Friends (missing)
5. Standard Deviation of Unique numerical IDs of following (missing)
6. standard deviation of unique numerical IDs of followers (missing)

(User Content)
1. the number of posted tweets (given)
2. number of posted tweets per day (calculate)
3. |links|in tweets /|tweets| (caluclate)
4. |unique links|in tweets /|tweets| (calculate)
5. |@username|in tweets /|tweets| (calculate)
6. |unique@username|in tweets /|tweets| (calculate)
7. Average Content Similarity over all pairs of tweets posted by a user (missing, don't know similarity metric)
8. ZIP compression ratio of posted tweets (calculate, hard)

(User History)
1. Change rate of number of following (calcualte)

In [20]:
len(ham_users['UserID'].unique())

19276

In [88]:
import numpy as np
# given features
ham_users_input = ham_users[['UserID',
                             'LengthOfScreenName',
                            'LengthOfDescriptionInUserProfile',
                           'NumberOfFollowings',
                           'NumberOfFollowers',
                           'NumberOfTweets']]

In [89]:
# add longevity
from datetime import datetime
date_format = '%Y-%m-%d %H:%M:%S'

def get_longevity(start,end):
    start_date = datetime.strptime(start, date_format)
    end_date = datetime.strptime(end, date_format)
    return (end_date - start_date).days

ham_users_input['Longevity'] = ham_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ham_users_input['Longevity'] = ham_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)


In [90]:
# add ratio of followings to followers
ham_users_input['FollowRatio'] = ham_users_input['NumberOfFollowings'] / ham_users_input['NumberOfFollowers']
# replace divide by zero errors with 0
ham_users_input['FollowRatio'] = ham_users_input['FollowRatio'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [91]:
# add number of posted tweets per day
ham_users_input['TweetsPerDay'] = ham_users_input['NumberOfTweets'] / ham_users_input['Longevity']
# replace divide by zero errors with 0
ham_users_input['TweetsPerDay'] = ham_users_input['TweetsPerDay'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [95]:
# add average links per tweet
import re

# function for counting number of links in tweet
def count_links(text):
    return len(re.findall('https?://', str(text)))

# add link count to tweets dataframe
ham_tweets['LinkCount'] = ham_tweets['Tweet'].apply(lambda x: count_links(x))

# get average links by user
average_links = ham_tweets.groupby('UserID')['LinkCount'].mean().reset_index()
average_links.columns = ['UserID','AverageNumLinks']

# merge with features
ham_users_input = pd.merge(ham_users_input, average_links, on='UserID', how='left')

Unnamed: 0,UserID,LengthOfScreenName,LengthOfDescriptionInUserProfile,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,Longevity,FollowRatio,TweetsPerDay,AverageNumLinks
0,614,10,34,510,350,3265,1226,1.457143,2.663132,0.31000
1,1038,7,156,304,443,4405,1219,0.686230,3.613618,0.27500
2,1437,6,37,45,73,725,1219,0.616438,0.594750,0.09500
3,2615,7,0,211,230,211,1226,0.917391,0.172104,0.13500
4,3148,8,97,7346,7244,11438,1213,1.014081,9.429514,0.58794
...,...,...,...,...,...,...,...,...,...,...
19271,93390990,11,0,5,0,5,0,0.000000,0.000000,0.80000
19272,93402679,12,0,20,1,1,0,20.000000,0.000000,0.00000
19273,93419256,8,0,0,0,1,0,0.000000,0.000000,0.00000
19274,93426370,10,0,20,1,1,0,20.000000,0.000000,0.00000
