## Data collection

I have found a [dataset](https://github.com/antibot4navalny/accounts_labelled/blob/main/labels.json) of accounts recognized as Twitter bots'. The dataset is used in Chrome extention '[MetaBot for Twitter](https://github.com/antibot4navalny/metabot)' that highlights bots on Twitter.

To get a list of account names from the JSON I simply subset even keys.

As I have collected some data before I use list comparation to update information only for new names in bot list (updatable).

In [411]:
import pandas as pd
import json

from twitter_scraper_selenium import get_profile_details

In [10]:
try:
    df = pd.read_csv('twitter_profiles_dataset.csv')
    bots_old_names = set(df.screen_name.tolist())
except:
    bots_old_names = ()
print('bots_old_names: ', len(bots_old_names))

bots_old_names:  672


In [11]:
# Turn bots json file into a dictionary and compare with an old list of bots to check for updates:
bots_url = 'https://raw.githubusercontent.com/antibot4navalny/accounts_labelled/main/labels.json'
bots_dict = pd.read_json(bots_url, typ='series')
bots_list = list(bots_dict.keys())
bots_names = bots_list[1::2]

bots_new_names = list(set(bots_names) - set(bots_old_names))

bots_new_names
print(len(bots_new_names), bots_new_names)

51 ['GordeevaSamara', 'Katheri92373666', 'KathyHo87241217', 'kostkornl', 'Rosalva23944962', 'iraannhh', 'ilushakov_', 'lavrent_pasha', 'zokuznecova', 'Jennife46249662', 'Janell66927328', 'ded_maloletnij', 'sigazasigoy', 'AndreaP73330414', 'gordej_popov', 'skvorcov_trosha', 'IPracko', 'Tamekia66825359', 'hochu_pitsuu', 'ehidnaa_k', 'ollivimesh', 'kottowmak', 'grigfeds', 'zazerkal_v', 'ahmadiev_anton', 'apokalipsisa_s', 'aisa_letova', 'KMarmedova', 'vasencoo', 'osipovsah', 'svtojblinik_', 'licnyj_sort', 'polutonnaa', 'baskortotstan', 'marienmak', 'SofiMitrofanov_', 'oleghromov_', 'marknikoll', 'Jennife91361116', 'Jennife93704963', 'markinowa', 'tomatnyjsok2', 'justanaccsi', 'Krystal42545662', 'SasaKaramzin', 'kaztinina', 'JodiHeidenreic3', 'saharvvas', 'Misherbakov', 't_sledak', 'martinovpt']


In [15]:
# Use a simple loop for collecting bots' profiles into one dataframe:
profiles = []
for bot in bots_new_names:
    bot_profile = get_profile_details(twitter_username=bot)
    data = json.loads(bot_profile)
    profiles.append(data)
    #time.sleep(1)

df_bots_new = pd.json_normalize(profiles)

In [None]:
# Mark bot users by '1' 
df_bots_new['bot'] = 1

In [69]:
# Append old dataset with collected new data:
try:
    df = pd.concat([df, df_bots_new], axis=0, ignore_index=True)
except:
    df = df_bots_new

df

Unnamed: 0,id,id_str,name,screen_name,location,profile_location,description,url,protected,followers_count,...,status.retweeted_status.possibly_sensitive,status.retweeted_status.possibly_sensitive_editable,status.retweeted_status.self_thread.id,status.retweeted_status.self_thread.id_str,status.retweeted_status.entities.media,status.retweeted_status.extended_entities.media,status.self_thread.id,status.self_thread.id_str,entities.url.urls,bot
0,1.582274e+18,1.582274e+18,путешествие в маусвиль,MausvilV,,,,,False,0.0,...,,,,,,,,,,1
1,1.584492e+18,1.584492e+18,Елена Горбачева,lenagrbu,,,,,False,0.0,...,,,,,,,,,,1
2,1.557303e+18,1.557303e+18,Ева Карпова,EvaKarpoff,,,Просто Ева ❤️,,False,1.0,...,,,,,,,,,,1
3,1.584519e+18,1.584519e+18,Даниил Воробушкин,d_vorobuskin,,,,,False,0.0,...,,,,,,,,,,1
4,1.580882e+18,1.580882e+18,вечеринка с коктейлями и тако,SVecerinka,,,,,False,0.0,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,,,,,,,,,,,...,,,,,,,,,,1
769,,,,,,,,,,,...,,,,,,,,,,1
770,,,,,,,,,,,...,,,,,,,,,,1
771,,,,,,,,,,,...,,,,,,,,,,1


In [70]:
# Save our dataframe to .csv file:
df.to_csv('twitter_profiles_dataset.csv', encoding='utf-8', index=False)

Now I have a list of bot users and now it's time to mix them with real person users to get a train set for a bot classification problem.
The file "organic_names_IDs.txt" was provided by the same author.

In [71]:
organic_url = "organic_names_IDs.txt"
organic_df = pd.read_csv(organic_url, sep='\t', header=None, na_filter=False)
organic_names = list(organic_df[0].unique())
print('organic_names: ', len(organic_names))

organic_names:  551


The way of collecting organic profiles data is the same.

In [468]:
profiles = []
for organic in organic_names:
    organic_profile = get_profile_details(twitter_username=organic)
    data = json.loads(organic_profile)
    profiles.append(data)
    #time.sleep(1)

df_organic_new = pd.json_normalize(profiles)

Unnamed: 0,id,id_str,name,screen_name,location,profile_location,description,url,protected,followers_count,...,status.retweeted_status.place.bounding_box.coordinates,status.withheld_scope,status.withheld_in_countries,status.retweeted_status.withheld_scope,status.retweeted_status.withheld_in_countries,status.retweeted_status.quoted_status_id,status.retweeted_status.quoted_status_id_str,status.self_thread.id,status.self_thread.id_str,bot
0,1.510548e+18,1.510548e+18,Vladimir Vataman,VladimirVatama2,,,,,False,1.0,...,,,,,,,,,,0
1,1.510550e+18,1.510550e+18,Dzapo Lucci,Dimosyno,Третья от Солнца,,Растите ваши крылья поближе к голове.\nВо избе...,,False,80.0,...,,,,,,,,,,0
2,1.510732e+18,1.510732e+18,Светлана Грицкова,SGrickova,,,,,False,0.0,...,,,,,,,,,,0
3,1.510869e+18,1.510869e+18,Толендияр Сейдакбаров,tolendiar,,,,,False,0.0,...,,,,,,,,,,0
4,1.510928e+18,1.510928e+18,Петр Углов,UglovPP,,,,,False,0.0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,1.583160e+18,1.583160e+18,Runner to Freedom,RunnertoFreedom,,,#nowar\nNo name writer,,False,0.0,...,,,,,,,,,,0
547,1.575539e+18,1.575539e+18,@Javi_DarkOSINT,JDarkosint,,,,,False,0.0,...,,,,,,,,,,0
548,1.580157e+18,1.580157e+18,Vahmurka,admiral_sraka,,,,,False,25.0,...,,,,,,,,,,0
549,1.582674e+18,1.582674e+18,Stellar Wildcat,WildStellarCat,,,,,False,0.0,...,,,,,,,,,,0


In [None]:
# Mark organic users by '0' 
df_organic_new['bot'] = 0

In [None]:
# Append old dataset with collected new data:
try:
    df = pd.concat([df, df_organic_new], axis=0, ignore_index=True)
except:
    df = df_organic_new

df

In [None]:
# Save our dataframe to .csv file:
df.to_csv('twitter_profiles_dataset.csv', encoding='utf-8', index=False)