<li>Tweet similarity:   
(1) $S=\frac{\sum_{p\in P}c(p)}{l_al_p}$    
where $P$ is the set of possible tweet-to-tweet combinations among any two tweets logged for a certain account, $p$ is a single pair, $c(p)$ is a function calculation the number of words two tweets share, $l_a$ is the average length of tweets posted by that user, and $l_p$ is the number of tweet combinations. A profile sending similar tweets will have a low value of S.[4] 

In [55]:
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
import itertools as it
from math import factorial
import numpy as np

In [36]:
def create_df(user_id):
    """
    remove unnecessary words from the user_tweets.csv
    
    Argument: user_id
    
    Return: user_tweets dataframe
    """
    user_id = str(user_id)
    user_tweets = pd.read_csv(user_id + "_tweets.csv")
    special_remove = [tweets.decode('unicode_escape').encode('ascii','ignore') for tweets in user_tweets['tweet_text']] 
    user_tweets['tweet_split'] = [tweets.lower().split() for tweets in special_remove]
    user_tweets['tweet_split'] = [filter(lambda x: not (x.startswith("@") or x.startswith("#") or x.startswith("https:") or x in stopwords.words("english") or x.startswith("rt")), tweet) for tweet in user_tweets['tweet_split']]
    user_tweets['tweet_split_string'] = [' '.join(str(x) for x in tweets) for tweets in user_tweets['tweet_split']]
    return user_tweets
    

In [7]:
user_tweets = create_df(707741206416138240)

In [66]:
def cal_char(user_tweets):
    """
    calculate the length of tweets
    
    Argument: user_tweets dataframe created by create_df function
    
    Return: the length of every tweet
    """
    user_tweets['tweet_split'] = [tweets.lower().split() for tweets in user_tweets['tweet_text']]
    user_tweets['tweet_string'] = [filter(lambda x: not (x.startswith("@") or x.startswith("#") or x.startswith("https:") or x.startswith("rt")), tweet) for tweet in user_tweets['tweet_split']]
    char_count = [len(user_tweets['tweet_string'][i]) for i in range(len(user_tweets['tweet_string']))]
    return char_count

In [9]:
def comb_2(tweet_df):
    """
    calculate the number of tweet combinations
    
    Argument: num_tweets
    
    Return: total number of tweet combinations
    """
    num_tweets = len(tweet_df['tweet_text'])
    return int(factorial(num_tweets) / (factorial(2) * factorial(num_tweets - 2)))

In [10]:
def tweet_set(tweet_df):
    """
    create a set of possible tweet-to-tweet combinations among any two tweets
    
    Argument: tweet_df
    
    Return: set of tweet combinations
    """
    tweet_list = list(tweet_df["tweet_split"])
    tweet_tuples = list(it.combinations(tweet_list, 2))
    tweet_df = pd.DataFrame(tweet_tuples, columns = ["tweet_1", "tweet_2"])
    tweet_df["tweet_combination"] = tweet_df["tweet_1"] + tweet_df["tweet_2"]
    tweet_df['common_words'] = [set([x for x in tweet if tweet.count(x) > 1]) for tweet in tweet_df['tweet_combination']]
    tweet_df['common_count'] = [len(tweet_df['common_words'][i]) for i in range(len(tweet_df['common_words']))]  
    return tweet_df

In [11]:
tweet_comb = tweet_set(user_tweets)

In [71]:
def tweet_sim(user_id):
    user_tweets = create_df(user_id)
    char_count = cal_char(user_tweets)
    tweet_comb = tweet_set(user_tweets)
    sim_value = tweet_comb['common_count'].sum() / (np.mean(char_count) * comb_2(user_tweets))
    return sim_value

In [72]:
# Fake Account
tweet_sim(707741206416138240)

0.18818907302675614

In [152]:
# Genuine Account
tweet_sim(11630862)

0.11177355566759575

In [85]:
final_data = pd.read_csv(open('final data.csv', 'rU'), 
                      encoding = 'utf-8')

In [141]:
for i in range(len(final_data)):
    if final_data['label'][i] == "spam":
        print i

In [149]:
for i in range(len(final_data)):
    if final_data['id'][i] == 11630862:
        print i

9039


In [150]:
final_data['label'][9039]

u'genuine'

In [137]:
for i in range(len(final_data)):
    if final_data['id'][i] == 2731681:
        print i

11829


In [147]:
final_data['label'][11829]

u'genuine'