In [73]:
import os
import pandas as pd
import nltk
import numpy as np
import pickle
from random import randint

In [93]:
def load_csv_file(file_name, col_name=None):
    """Read a CSV from ``<cwd>/Data/Profile-Data/`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, relative to the ``Data/Profile-Data`` directory
        under the current working directory.
    col_name : str, optional
        When given, only this column is read; otherwise every column is read.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with its first row used as the header.
    """
    # os.path.join uses the platform's separator instead of hard-coded slashes.
    full_path = os.path.join(os.getcwd(), 'Data', 'Profile-Data', file_name)

    # usecols=None makes pandas read every column.
    usecols = None if col_name is None else [col_name]
    return pd.read_csv(full_path, header=0, usecols=usecols)


def map_russian_bots_header(df):
    """Rename the Russian-bot columns to the ``user_*`` scheme and add missing ones.

    The frame is modified in place (``id`` is filled and cast, the column
    labels are replaced, and extra columns are appended by
    ``add_missing_columns_to_russian_bots``) and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``id`` column. The rename below is positional, so
        it presumably expects exactly the 14 columns listed, in that order
        -- verify against the source CSV.

    Returns
    -------
    pandas.DataFrame
        The same frame, with renamed and added columns.
    """
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df['id'] selection: that chained in-place form does not reliably
    # update df (it is a no-op under pandas Copy-on-Write), which would leave
    # NaNs in place and make the int64 cast fail.
    df['id'] = df['id'].fillna(0).astype('int64')

    # Positional rename: pandas raises ValueError if the column count differs.
    df.columns = ['user_id', 'user_location', 'user_name', 'user_followers_count', 'user_statuses_count',
                  'user_time_zone', 'user_verified', 'user_lang', 'user_screen_name', 'user_description',
                  'user_created_at', 'user_favourites_count', 'user_friends_count', 'user_listed_count']

    return add_missing_columns_to_russian_bots(df)

def clean_my_bot_data(df):
    """Drop the columns listed below from ``df``, in place.

    Labels that are not present in ``df`` are skipped silently
    (``errors='ignore'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, for call-chaining convenience.
    """
    # One de-duplicated list: the original repeated the first group of labels
    # in a second drop call and listed 'user_profile_image_url' twice.
    unwanted_columns = [
        'bot_score', 'cap', 'tweet_count', 'tweet_time', 'tweet_text',
        'user_following', 'user_url', 'cleaned_description', 'political_word_count',
        'user_profile_image_url',
        'user_profile_background_color', 'user_utc_offset', 'user_listed_count.1',
    ]
    df.drop(unwanted_columns, inplace=True, axis=1, errors='ignore')
    return df


def print_list(data):
    """Print every element of the iterable ``data`` on its own line."""
    for item in data:
        print(item)
        

def check_headers(russian_df, my_df):
    """Print the column labels that occur in only one of the two frames.

    An empty set in the printed message means both frames have the same
    column labels (order is not compared). Nothing is returned.
    """
    mismatched = set(my_df.columns.values) ^ set(russian_df.columns.values)
    print('The following columns are still a problem: ', mismatched)

def add_missing_columns_to_russian_bots(df, seed=None):
    """Add three synthetic boolean profile columns to ``df``, in place.

    Columns added:

    * ``user_default_profile_image`` -- always ``False``.
    * ``user_default_profile`` -- ``True`` when the row's draw is >= 75.
    * ``user_geo_enabled`` -- ``True`` when the row's draw is >= 84.

    One integer is drawn per row from ``randint(0, 100)`` (both ends
    inclusive) and used for *both* random columns, so a row with
    ``user_geo_enabled`` set always has ``user_default_profile`` set too.
    NOTE(review): confirm this coupling is intended; independent flags would
    need a separate draw per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is mutated.
    seed : optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible. When ``None`` (default) the module-level
        ``randint`` is used, as before, so ``random.seed`` is still honoured.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame.
    """
    if seed is None:
        draw = randint
    else:
        # Local import: the notebook's import cell only brings in ``randint``.
        from random import Random
        draw = Random(seed).randint

    draws = [draw(0, 100) for _ in range(len(df))]

    df['user_default_profile_image'] = False
    df['user_default_profile'] = [value >= 75 for value in draws]
    df['user_geo_enabled'] = [value >= 84 for value in draws]

    return df


def save_df_to_csv(df, file_name):
    """Write ``df`` to ``<file_name>.csv`` as UTF-8, without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    file_name : str
        Target path. ``.csv`` is appended unless the name already ends with
        it (case-insensitive), which avoids producing ``name.csv.csv``.
    """
    if not file_name.lower().endswith('.csv'):
        file_name += '.csv'
    df.to_csv(file_name, encoding='utf-8', index=False)
    

In [89]:
# Load the two raw profile datasets (read from Data/Profile-Data under the
# current working directory).
russian_bots = load_csv_file(file_name='OfficialRussianBotUsers.csv')
my_bots = load_csv_file(file_name='political_bot_profiles(1-word).csv')

In [91]:
# Bring both frames to the same user_* column layout.
# map_russian_bots_header reads the 'id' column and then renames it, so re-run
# the load cell before re-running this one.
russian_bots = map_russian_bots_header(df=russian_bots)
my_bots = clean_my_bot_data(df=my_bots)

In [92]:
# Sanity check: the printed set should be empty once both frames share the same columns.
check_headers(russian_df=russian_bots, my_df=my_bots)

The following columns are still a problem:  set()


In [94]:
# Stack both datasets. ignore_index=True gives the result a fresh 0..n-1 index
# instead of repeating each input frame's own row labels.
combined = pd.concat([russian_bots, my_bots], ignore_index=True)
print(len(combined))

1517


In [95]:
# Persist the combined frame as 'Initial-Combined-Data.csv' (the helper adds the
# extension); the path is relative to the current working directory.
save_df_to_csv(df=combined, file_name='Initial-Combined-Data')