In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def load_csv_as_df(file_name, sub_directories, col_name=None):
    '''
    Load any csv as a pandas dataframe. Provide the filename, the subdirectories, and columns to read(if desired).
    '''
    # sub_directories = '/Data/'
    base_path = os.getcwd()
    full_path = base_path + sub_directories + file_name
    
    if col_name is not None:
        return pd.read_csv(full_path, usecols=[col_name])
    
    # print('Full Path: ', full_path)
    return pd.read_csv(full_path, header=0)

def describe_bots(df, return_dfs=False, for_timeline=False):
    
    if for_timeline:
        df = df.drop_duplicates(subset='user_id', keep='last')
        bot_df = df[df.user_cap >= 0.53]
        human_df = df[df.user_cap < 0.4]
        removed_df = df[(df['user_cap'] >= 0.4) & (df['user_cap'] < 0.53)]
    else:
        bot_df = df[df.cap >= 0.53]
        human_df = df[df.cap < 0.4]
        removed_df = df[(df['cap'] >= 0.4) & (df['cap'] < 0.53)]
  
    bot_percent = len(bot_df)/len(df) * 100
    human_percent = len(human_df)/len(df) * 100
    removed_percent = len(removed_df)/len(df) * 100
    
    total_num_tweets = df['tweet_count'].sum()
    total_bot_tweets = bot_df['tweet_count'].sum()
    percent_bot_tweets = (total_bot_tweets / total_num_tweets) * 100
    

    print('There are ', len(df), 'total records')
    print('There are ', len(bot_df), 'Bots in these records')
    print('There are a total of ', str(total_num_tweets), ' tweets sent')
    print('Bots sent ', str(total_bot_tweets), ' tweets or ', str(round(percent_bot_tweets, 2)), '% of tweets')
    print('Percentage of total accounts that are bots = ' + str(round(bot_percent, 2)) + '%')
    print('Percentage of total accounts that are humans = ' + str(round(human_percent, 2)) + '%')
    print('Percentage of total accounts that were removed = ' + str(round(removed_percent, 2)) + '%')
    print('Average account age in days ', df['age_in_days'].mean())
    print('Average bot account age in days ', bot_df['age_in_days'].mean())
    print('Average human account age in days ', human_df['age_in_days'].mean())
    print('Average overall CAP score ', df['cap'].mean())
    print('Average human CAP score ', human_df['cap'].mean())
    print('Average bot CAP score ', bot_df['cap'].mean())
    
    
    if return_dfs:
        return bot_df, human_df, removed_df
    
def show_number_of_errors(df1, df2):
    df = pd.concat([df1, df2])
    print('Total number of accounts where errors: ', len(df))
    return df

In [3]:
before_df = load_csv_as_df('MasterIDs-3.csv', '/Data/Master-Data/')
kav_only_df = load_csv_as_df('Kav-MasterIDs.csv', '/Data/Master-Data/')
almost_full_df = load_csv_as_df('MasterIDs-4.csv', '/Data/Master-Data/')

full_df = pd.concat([kav_only_df, almost_full_df])
# full_df = calculate_account_age_in_days(full_df)

In [4]:
error_df1 = load_csv_as_df('ErrorIDs-4.csv', '/Data/Master-Data/')
error_df2 = load_csv_as_df('Kav-ErrorIDs.csv', '/Data/Master-Data/')
erro_df = show_number_of_errors(error_df1, error_df2)

Total number of accounts where errors:  24105


In [5]:
# full_df.head()

In [12]:
all_bots = full_df[full_df.cap >= 0.53]

In [13]:
all_bots['user_id'] = all_bots['user_id'].astype('int64')
# print(all_bots['user_id'].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
all_bots.head()

Unnamed: 0,user_id,bot_score,cap,tweet_count,tweet_time,tweet_text,user_favourites_count,user_statuses_count,user_description,user_location,...,user_friends_count,user_default_profile,user_name,user_lang,user_screen_name,user_geo_enabled,user_profile_background_color,user_profile_image_url,user_time_zone,user_listed_count.1
22,1007607706847346688,4.4,0.661264,1,2018-09-26 03:28:47,น่าอิจฉาสุดๆ!! 3 ราศีใดที่ช่วงนี้จะมีโชคและได้...,3,3,,,...,8,True,กาญจนา ยุทธิวัจน์,th,1KNpMAYEny2440K,True,F5F8FA,http://abs.twimg.com/sticky/default_profile_im...,,0
37,921000049684856832,4.6,0.773905,1,2018-09-26 05:08:38,"RT @_GenUnlimited: ""I want to hear your voice....",379,1271,เมนหลักจีม เหนือเมนน้องกุก เหนือของเหนือเมนเจ้...,,...,384,True,เจ้าหญิงของบังทัน,th,jiminbtsvvvv,False,F5F8FA,http://pbs.twimg.com/profile_images/1044548320...,,0
49,986216409515479040,4.4,0.685259,1,2018-09-26 05:08:41,"RT @UNICEF: ""I want to hear your voice... No m...",9,429,ติ่งเกือบทุกวง แต่จะมั่นคงกับbangtan,,...,137,True,Numjoon,th,Numjoon_S2,False,F5F8FA,http://pbs.twimg.com/profile_images/9862206589...,,0
61,971803466732421120,4.7,0.845413,1,2018-09-26 05:08:46,RT @CNN: UN members laughed during President T...,37,2,,,...,24,True,Sarracenial,en,Sarracenial10,False,F5F8FA,http://abs.twimg.com/sticky/default_profile_im...,,0
84,1033348450631352320,4.4,0.685259,2,2018-09-26 05:08:52,RT @UNICEF: To everyone who watched today's ev...,880,270,Fan Account @BTS_twt,Việt Nam,...,31,True,thuw,vi,thuw19,False,F5F8FA,http://pbs.twimg.com/profile_images/1033355052...,,0


In [15]:
print(len(all_bots))

22808


In [16]:
initial_size = len(all_bots)
all_bots = all_bots.sort_values('cap')
all_bots = all_bots.drop_duplicates('user_id', keep='last')
new_size = len(all_bots)

print('Removed ', initial_size - new_size, ' duplicates!')

Removed  1390  duplicates!


In [17]:
print(len(all_bots))

21418


In [26]:
print(list(all_bots))

['user_id', 'bot_score', 'cap', 'tweet_count', 'tweet_time', 'tweet_text', 'user_favourites_count', 'user_statuses_count', 'user_description', 'user_location', 'user_created_at', 'user_verified', 'user_following', 'user_url', 'user_listed_count', 'user_followers_count', 'user_default_profile_image', 'user_utc_offset', 'user_friends_count', 'user_default_profile', 'user_name', 'user_lang', 'user_screen_name', 'user_geo_enabled', 'user_profile_background_color', 'user_profile_image_url', 'user_time_zone', 'user_listed_count.1']


In [27]:
column_list = ['tweet_text', 'tweet_time', 'user_id', 'tweet_count']
all_bots = all_bots[column_list]

In [28]:
print(len(all_bots))

22808


In [29]:
all_bots.to_csv('BotsToRecheck.csv', encoding='utf-8')

In [6]:
all_bot_ids = all_bots['user_id'].tolist()

In [7]:
profiles = full_df[full_df['user_id'].isin(all_bot_ids)]

In [9]:
weird = profiles[profiles.cap < 0.53]

In [10]:
weird.shape

(672, 28)

In [11]:
all_bots = full_df[full_df.cap >= 0.53]

In [12]:
wtf = all_bots[all_bots.cap < 0.53]
wtf.shape

(0, 28)

In [None]:
all_bot_ids = all_bots['user_id'].tolist()

In [14]:
original_size = len(full_df)

# Drop duplicate ids since we only need to get the user data once
df = full_df.drop_duplicates('user_id', keep='last')
unique_size = len(df)
print('Out of ', original_size, ' tweets there were ', (original_size - unique_size), ' duplicate ID\'s')






Out of  717422  tweets there were  21895  duplicate ID's
