### Import Data

In [1]:
# paths
ROOT_DIR = "caverlee-2011/social_honeypot_icwsm_2011/"

In [2]:
# read data
import pandas as pd

ham_users = pd.read_csv(ROOT_DIR + "legitimate_users.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
ham_tweets = pd.read_csv(ROOT_DIR + "legitimate_users_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])

ham_followings = pd.read_csv(ROOT_DIR + "legitimate_users_followings.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'SeriesOfNumberOfFollowings'])


spam_users = pd.read_csv(ROOT_DIR + "content_polluters.txt",
                         sep='\t',
                         names = ['UserID',
                                  'CreatedAt',
                                  'CollectedAt',
                                  'NumberOfFollowings',
                                  'NumberOfFollowers',
                                  'NumberOfTweets',
                                  'LengthOfScreenName',
                                  'LengthOfDescriptionInUserProfile'])
spam_tweets = pd.read_csv(ROOT_DIR + "content_polluters_tweets.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'TweetID',
                                                'Tweet',
                                                'CreatedAt'])
spam_followings = pd.read_csv(ROOT_DIR + "content_polluters_followings.txt",
                                       sep='\t',
                                       names = ['UserID',
                                                'SeriesOfNumberOfFollowings'])
       

### Feature Extraction

Features:

(User demographics)
1. Length of Screen Name (given)
2. Length of Description (given)
3. Longevity (calculate)

(User Friendship Networks)
1. Number of following (given)
2. Number of followers (given)
3. Ratio of Number of following and followers (calculate)
4. Percentage of Bidirectional Friends (missing)
5. Standard Deviation of Unique numerical IDs of following (missing)
6. standard deviation of unique numerical IDs of followers (missing)

(User Content)
1. the number of posted tweets (given)
2. number of posted tweets per day (calculate)
3. |links|in tweets /|tweets| (calculate)
4. |unique links|in tweets /|tweets| (calculate)
5. |@username|in tweets /|tweets| (calculate)
6. |unique@username|in tweets /|tweets| (calculate)
7. Average Content Similarity over all pairs of tweets posted by a user (missing, don't know similarity metric)
8. ZIP compression ratio of posted tweets (calculate)

(User History)
1. Change rate of number of following (calculate)

#### Given Features
Length of Screen Name, Length of User Profile Description, Number of Following, Number of Followers, Number of Tweets

In [3]:
import numpy as np
# given features
ham_users_input = ham_users[['UserID',
                             'LengthOfScreenName',
                            'LengthOfDescriptionInUserProfile',
                           'NumberOfFollowings',
                           'NumberOfFollowers',
                           'NumberOfTweets']]

spam_users_input = spam_users[['UserID',
                             'LengthOfScreenName',
                            'LengthOfDescriptionInUserProfile',
                           'NumberOfFollowings',
                           'NumberOfFollowers',
                           'NumberOfTweets']]

Extract number of Tweets recorded in the database per user. This will come in handy for calculations later down the line.

In [4]:
# get tweet counts by user
ham_tweet_counts = ham_tweets.groupby('UserID').size().reset_index()
ham_tweet_counts.columns = ['UserID','RecordedTweetCount']
ham_users = pd.merge(ham_users, ham_tweet_counts, on='UserID', how='left')

spam_tweet_counts = spam_tweets.groupby('UserID').size().reset_index()
spam_tweet_counts.columns = ['UserID','RecordedTweetCount']
spam_users = pd.merge(spam_users, spam_tweet_counts, on='UserID', how='left')

#### Longevity

In [5]:
# add longevity
from datetime import datetime
date_format = '%Y-%m-%d %H:%M:%S'

def get_longevity(start,end):
    start_date = datetime.strptime(start, date_format)
    end_date = datetime.strptime(end, date_format)
    return (end_date - start_date).days

ham_users_input['Longevity'] = ham_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)

spam_users_input['Longevity'] = spam_users.apply(lambda row: get_longevity(row['CreatedAt'],row['CollectedAt']), axis=1)

#### Ratio of Following to Followers

In [6]:
# add ratio of followings to followers
ham_users_input['FollowRatio'] = ham_users_input['NumberOfFollowings'] / ham_users_input['NumberOfFollowers']
# replace divide by zero errors with 0
ham_users_input['FollowRatio'] = ham_users_input['FollowRatio'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add ratio of followings to followers
spam_users_input['FollowRatio'] = spam_users_input['NumberOfFollowings'] / spam_users_input['NumberOfFollowers']
# replace divide by zero errors with 0
spam_users_input['FollowRatio'] = spam_users_input['FollowRatio'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Tweets Posted Per Day

In [7]:
# add number of posted tweets per day
ham_users_input['TweetsPerDay'] = ham_users_input['NumberOfTweets'] / ham_users_input['Longevity']
# replace divide by zero errors with 0
ham_users_input['TweetsPerDay'] = ham_users_input['TweetsPerDay'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add number of posted tweets per day
spam_users_input['TweetsPerDay'] = spam_users_input['NumberOfTweets'] / spam_users_input['Longevity']
# replace divide by zero errors with 0
spam_users_input['TweetsPerDay'] = spam_users_input['TweetsPerDay'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Average Links Per Tweet

In [8]:
# add average links per tweet
import re

# function for counting number of links in tweet
def count_links(text):
    return len(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(text)))

# add link count to tweets dataframe
ham_tweets['LinkCount'] = ham_tweets['Tweet'].apply(lambda x: count_links(x))
# get sum of links by user
ham_sum_links = ham_tweets.groupby('UserID')['LinkCount'].sum().reset_index()
ham_sum_links.columns = ['UserID','LinkSum']
# add link average to features
ham_users_input['LinkAverage'] = ham_sum_links['LinkSum'] / ham_users['RecordedTweetCount']
# replace divide by zero errors with 0
ham_users_input['LinkAverage'] = ham_users_input['LinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add link count to tweets dataframe
spam_tweets['LinkCount'] = spam_tweets['Tweet'].apply(lambda x: count_links(x))
# get sum of links by user
spam_sum_links = spam_tweets.groupby('UserID')['LinkCount'].sum().reset_index()
spam_sum_links.columns = ['UserID','LinkSum']
# add link average to features
spam_users_input['LinkAverage'] = spam_sum_links['LinkSum'] / spam_users['RecordedTweetCount']
# replace divide by zero errors with 0
spam_users_input['LinkAverage'] = spam_users_input['LinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Average Unique Links Per Tweet

In [9]:
# add average unique links per tweet

# function for finding links in tweet
def get_links(text):
    return set(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(text)))

# function for merging sets
def merge_sets(sets):
    return set.union(*sets)

# add unique links to tweets dataframe
ham_tweets['UniqueLinks'] = ham_tweets['Tweet'].apply(lambda x: get_links(x))
# get unique links by user
ham_unique_links = ham_tweets.groupby('UserID')['UniqueLinks'].agg(merge_sets).reset_index()
ham_unique_links.columns = ['UserID','UniqueLinks']
ham_unique_links['UniqueLinkSum'] = ham_unique_links['UniqueLinks'].apply(lambda x: len(x))
# add unique link average to features
ham_users_input['UniqueLinkAverage'] = ham_unique_links['UniqueLinkSum'] / ham_users['RecordedTweetCount']
# replace divide by zero errors with 0
ham_users_input['UniqueLinkAverage'] = ham_users_input['UniqueLinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add unique links to tweets dataframe
spam_tweets['UniqueLinks'] = spam_tweets['Tweet'].apply(lambda x: get_links(x))
# get unique links by user
spam_unique_links = spam_tweets.groupby('UserID')['UniqueLinks'].agg(merge_sets).reset_index()
spam_unique_links.columns = ['UserID','UniqueLinks']
spam_unique_links['UniqueLinkSum'] = spam_unique_links['UniqueLinks'].apply(lambda x: len(x))
# add unique link average to features
spam_users_input['UniqueLinkAverage'] = spam_unique_links['UniqueLinkSum'] / ham_users['RecordedTweetCount']
# replace divide by zero errors with 0
spam_users_input['UniqueLinkAverage'] = spam_users_input['UniqueLinkAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Average Mentions Per Tweet

In [10]:
# add average mentions per tweet

def count_mentions(text):
    return len(re.findall("@([a-zA-Z0-9]{1,15})", str(text)))

# add username count to tweets dataframe
ham_tweets['MentionCount'] = ham_tweets['Tweet'].apply(lambda x: count_mentions(x))
# get sum of mentions by user
ham_sum_mentions = ham_tweets.groupby('UserID')['MentionCount'].sum().reset_index()
ham_sum_mentions.columns = ['UserID','MentionSum']
# add mention average to features
ham_users_input['MentionAverage'] = ham_sum_mentions['MentionSum'] / ham_users['RecordedTweetCount']
# replace divide by zero errors with 0
ham_users_input['MentionAverage'] = ham_users_input['MentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add username count to tweets dataframe
spam_tweets['MentionCount'] = spam_tweets['Tweet'].apply(lambda x: count_mentions(x))
# get sum of mentions by user
spam_sum_mentions = spam_tweets.groupby('UserID')['MentionCount'].sum().reset_index()
spam_sum_mentions.columns = ['UserID','MentionSum']
# add mention average to features
spam_users_input['MentionAverage'] = spam_sum_mentions['MentionSum'] / spam_users['RecordedTweetCount']
# replace divide by zero errors with 0
spam_users_input['MentionAverage'] = spam_users_input['MentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Average Unique Mentions Per Tweet

In [11]:
# add average unique mentions per tweet

# function for finding mentions in tweet
def get_mentions(text):
    return set(re.findall("@([a-zA-Z0-9]{1,15})", str(text)))

# add unique mentions to tweets dataframe
ham_tweets['UniqueMentions'] = ham_tweets['Tweet'].apply(lambda x: get_mentions(x))
# get unique mentions by user
ham_unique_mentions = ham_tweets.groupby('UserID')['UniqueMentions'].agg(merge_sets).reset_index()
ham_unique_mentions.columns = ['UserID','UniqueMentions']
ham_unique_mentions['UniqueMentionSum'] = ham_unique_mentions['UniqueMentions'].apply(lambda x: len(x))
# add unique mention average to features
ham_users_input['UniqueMentionAverage'] = ham_unique_mentions['UniqueMentionSum'] / ham_users['RecordedTweetCount']
# replace divide by zero errors with 0
ham_users_input['UniqueMentionAverage'] = ham_users_input['UniqueMentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

# add unique mentions to tweets dataframe
spam_tweets['UniqueMentions'] = spam_tweets['Tweet'].apply(lambda x: get_mentions(x))
# get unique mentions by user
spam_unique_mentions = spam_tweets.groupby('UserID')['UniqueMentions'].agg(merge_sets).reset_index()
spam_unique_mentions.columns = ['UserID','UniqueMentions']
spam_unique_mentions['UniqueMentionSum'] = spam_unique_mentions['UniqueMentions'].apply(lambda x: len(x))
# add unique mention average to features
spam_users_input['UniqueMentionAverage'] = spam_unique_mentions['UniqueMentionSum'] / spam_users['RecordedTweetCount']
# replace divide by zero errors with 0
spam_users_input['UniqueMentionAverage'] = spam_users_input['UniqueMentionAverage'].replace([np.inf, -np.inf], np.nan).fillna(0)

#### Zip Compression Ratio

In [12]:
# add zip compression ratio

import zlib

# get compression ratio of tweets
def get_compression_ratio(text):
    data=text.encode('utf-8')
    compressed_data = zlib.compress(data)
    return len(data)/len(compressed_data)

# concatenate tweets by user
ham_concat_tweets = ham_tweets.groupby('UserID')['Tweet'].agg(lambda x: ' '.join(str(x))).reset_index()
ham_concat_tweets.columns = ['UserID','Concat_Tweets']
# calculate compression ratio from concatenated tweets
ham_concat_tweets['Compression_Ratio'] = ham_concat_tweets['Concat_Tweets'].apply(lambda x: get_compression_ratio(x))
# add to features
ham_users_input = pd.merge(ham_users_input, ham_concat_tweets[['UserID','Compression_Ratio']], on='UserID', how='left')
# give users with no tweets compression ratio of 0
ham_users_input['Compression_Ratio'] = ham_users_input['Compression_Ratio'].fillna(0)

# concatenate tweets by user
spam_concat_tweets = spam_tweets.groupby('UserID')['Tweet'].agg(lambda x: ' '.join(str(x))).reset_index()
spam_concat_tweets.columns = ['UserID','Concat_Tweets']
# calculate compression ratio from concatenated tweets
spam_concat_tweets['Compression_Ratio'] = spam_concat_tweets['Concat_Tweets'].apply(lambda x: get_compression_ratio(x))
# add to features
spam_users_input = pd.merge(spam_users_input, spam_concat_tweets[['UserID','Compression_Ratio']], on='UserID', how='left')
# give users with no tweets compression ratio of 0
spam_users_input['Compression_Ratio'] = spam_users_input['Compression_Ratio'].fillna(0)

#### Following Change Rate

In [13]:
# add following change rate
import math

# calculate change rate based on sequence string
def calculate_change_rate(seq_str):
    seq = [int(x) for x in seq_str.split(",")]
    n = len(seq)
    # perform summation
    total =0
    for i in range(0,n-1):
        total += seq[i+1] - seq[i]
    return total/(n-1)
#     return math.sqrt(total/(n-1))

# calculate following change rate by user
ham_followings['FollowingChangeRate'] = ham_followings['SeriesOfNumberOfFollowings'].apply(lambda x: calculate_change_rate(x))
# add to features
ham_users_input = pd.merge(ham_users_input, ham_followings[['UserID','FollowingChangeRate']], on='UserID', how='left')
# fill in zeroes
ham_users_input['FollowingChangeRate'] = ham_users_input['FollowingChangeRate'].fillna(0)

# calculate following change rate by user
spam_followings['FollowingChangeRate'] = spam_followings['SeriesOfNumberOfFollowings'].apply(lambda x: calculate_change_rate(x))
# add to features
spam_users_input = pd.merge(spam_users_input, spam_followings[['UserID','FollowingChangeRate']], on='UserID', how='left')
# fill in zeroes
spam_users_input['FollowingChangeRate'] = spam_users_input['FollowingChangeRate'].fillna(0)

### Data Preparation

In [14]:
# add labels
ham_users_input['label'] = 0 
spam_users_input['label'] = 1

In [15]:
# combine dataframes
df = pd.concat([ham_users_input,spam_users_input])
# drop userID
df.drop(['UserID'], axis=1, inplace=True)

In [16]:
# separate features from labels
X = df.drop('label', axis = 1)
y = df['label']

In [17]:
# split train/test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

### Training Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# train RF
clf = RandomForestClassifier(random_state=23)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=23)

### Testing Classifer

In [19]:
# predict on test set
y_pred = clf.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.95
