In [7]:
import pandas as pd
import glob

from collections import Counter
import pandas as pd
from pathlib import Path
import snscrape.modules.twitter as sntwitter
import itertools
import math

import pickle
import os
import glob
import logging
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import snscrape.modules.twitter as sntwitter
import itertools
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

import re


### PREPROCESSING
pd.options.display.max_rows=100
pd.options.display.max_columns = 100

def get_mentioned_users(x):
    users = []
    try:
        for i in range(len(x)):
            users = users + [x[i]['username']]
    except:
        pass
    return users

since = '2020-06-01'
until = '2020-12-31'
keywords = ['wuhan', 'ncov', 'coronavirus', 'covid', 'sars-cov-2', 'pandemic', 'lockdown', 'quarantine', 
'social distancing', 'wearing masks', 'vaccination', 'vaccine', 'outbreak', 'panic buying', 'remote working', 'homeschooling']
locations = {
    'india': '20.385825381874263,78.22265625000001,1159.759441155552km',
    'philippines': '12.211180191503997,124.45312500000001,932.0842548009142km',
    'nigeria': '9.058702156392139,7.866210937500001,597.7202418320643km',
    'malaysia': '4.149200693099289,102.43652343750001,294.4197592376132km',
    'indonesia1': '-0.3515602939922709,110.91796875000001,1832.0141760886796km',
    'indonesia2': '-4.214943141390639,135.87890625000003,788.510726515212km',
    'kenya': '-0.4614207935306084,37.50732421875001,494253.76183736156km'
}

# Load model
filename = 'news_model.sav'
news_model = pickle.load(open(filename, 'rb'))

# Load dictionary
filename = 'dic.sav'
dictionary = pickle.load(open(filename, 'rb'))

# Define list of names
names = []
for i in range(len(dictionary)):
    names.append(dictionary[i])

# Column names
colnames = ['username', 'displayname', 'userId', 'description', 'verified', 'created', 'followersCount', 
    'friendsCount', 'statusesCount', 'favouritesCount', 'listedCount', 'mediaCount', 'location', 
    'protected', 'linkUrl', 'profileImageUrl', 'profileBannerUrl', 'FreqCount', 'Place']

# Features
features = [
    'verified', 'usernameLink', 'age_in_days', 'tweets_per_day', 'media_per_tweet', 'listed_per_followers',
    'favourites_per_tweet', 'logFollowers', 'logListed', 'hasLocation', 'hasLink', 'hasBanner'
] + names

# NLTK Stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'are', 'com', 'africa', 'african', 'south', 'american', 'america'])

def clean_data(text):
    return [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in text.split("\n\n\n\n\n\n\n\n\n\n\n")][0].lower()

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def remove_pre_link(x):
    try:
        return x.strip('https://').strip('http://').strip('www.').split('.')[0]
    except:
        return x

### RUNNING
# Get tweets
users = pd.DataFrame()
for place in locations.keys():
    search = "{} filter:replies lang:en since:{} until:{} geocode:{}".format('''("''' + '''" OR "'''.join(keywords) + '''")''', since, until, locations[place]) 
    df = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(search).get_items(), None))
    if df.shape[0]==0:
        continue
    tmp = (list(itertools.chain.from_iterable(df.mentionedUsers.apply(lambda x: get_mentioned_users(x)))))
    tmp = pd.DataFrame(Counter(tmp).items(), columns=['account', 'count'])
    tmp = tmp[tmp['count']>math.ceil(df.shape[0]/5000)].sort_values('count', ascending=False)
    tmp['place'] = place
    users = users.append(tmp)
    print('finish', place)
users = users.reset_index(drop=True)

# Get list of users
print('Start collecting users...', len(users.account.unique()))
values = []
for user in users.account.unique():
    try:
        freqCount = users[users.account==user]['count'].values[0]
        place = users[users.account==user].place.values[0]
        s = sntwitter.TwitterProfileScraper(user).entity
        values.append([s.username, s.displayname, s.id, s.description, s.verified, s.created, 
            s.followersCount, s.friendsCount, s.statusesCount, s.favouritesCount, s.listedCount, 
            s.mediaCount, s.location, s.protected, s.linkUrl, s.profileImageUrl, s.profileBannerUrl,
            freqCount, place])
    except:
        pass
df = pd.DataFrame(values, columns=colnames)
print('Finished collecting users.')

# Fix dataframe
df = df.drop_duplicates('userId')
df['description'] = np.where(df.description.isnull(), '', df.description)
df = df.reset_index(drop=True)
df['created'] = pd.to_datetime(df.created, unit='ns')
df['created'] = df.created.dt.tz_localize(None)

# Create list of documents
docs = list(df.description.values)
for i, text in enumerate(docs):
    docs[i] = clean_data(docs[i])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
    
# Remove stopwords
docs = remove_stopwords(docs)

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 2] for doc in docs]

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=3)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Define OHE
ohe = []
for c in corpus:
    z = np.zeros(len(dictionary))
    for idx in range(len(c)):
        z[c[idx][0]] = c[idx][1]
    ohe.append(list(z))
ohe = pd.DataFrame(ohe, columns=names)
df = pd.concat([df, ohe], axis=1)

# Create features
df['usernameLink'] = df.linkUrl.map(lambda x: remove_pre_link(x)) == df.username
df['age_in_days'] = (pd.to_datetime('now').tz_localize(None) - df.created).dt.days
df['tweets_per_day'] = df.statusesCount / (1 + df.age_in_days)
df['media_per_tweet'] = df.mediaCount / (1 + df.statusesCount)
df['listed_per_followers'] = df.listedCount / (1 + df.followersCount)
df['favourites_per_tweet'] = df.favouritesCount / (1 + df.statusesCount)
df['logFollowers'] = np.log10(1 + df.followersCount)
df['logListed'] = np.log10(1 + df.listedCount)
df['hasLocation'] = ~(df.location.isnull())
df['hasLink'] = ~(df.linkUrl.isnull())
df['hasBanner'] = ~(df.profileBannerUrl.isnull())

# Predict news agency
df['pred'] = news_model.predict(df[features])

finish india
finish philippines
finish nigeria
finish malaysia
finish indonesia1
finish indonesia2
finish kenya
Start collecting users... 3067
Finished collecting users.


In [17]:
search

'("wuhan" OR "ncov" OR "coronavirus" OR "covid" OR "sars-cov-2" OR "pandemic" OR "lockdown" OR "quarantine" OR "social distancing" OR "wearing masks" OR "vaccination" OR "vaccine" OR "outbreak" OR "panic buying" OR "remote working" OR "homeschooling") filter:replies lang:en since:2020-06-01 until:2020-12-31 geocode:-0.4614207935306084,37.50732421875001,494253.76183736156km'

In [18]:
'filter:replies lang:en since:2020-06-01 until:2020-12-31 geocode:' + '-0.3515602939922709,110.91796875000001,1832.0141760886796km'

'filter:replies lang:en since:2020-06-01 until:2020-12-31 geocode:-0.3515602939922709,110.91796875000001,1832.0141760886796km'

In [8]:
df[df.pred].to_csv('pred.csv')

In [16]:
df.Place.value_counts()

india          1274
kenya           258
philippines      84
Name: Place, dtype: int64

In [19]:
df[df.pred]

Unnamed: 0,username,displayname,userId,description,verified,created,followersCount,friendsCount,statusesCount,favouritesCount,listedCount,mediaCount,location,protected,linkUrl,profileImageUrl,profileBannerUrl,FreqCount,Place,life,health,news,member,like,president,proud,host,love,show,account,new,official,official_twitter,twitter,author,founder,father,follow,husband,opinion,tweet,former,world,politics,mom,time,wife,book,political,dad,house,endorsement,people,medium,state,senator,governor,breaking,cnn,usernameLink,age_in_days,tweets_per_day,media_per_tweet,listed_per_followers,favourites_per_tweet,logFollowers,logListed,hasLocation,hasLink,hasBanner,pred
18,ndtv,NDTV,37034483,Breaking news alerts from India.\n\nInstagram:...,True,2009-05-01 20:34:48,15616566,15,818219,0,13348,409190,India,False,http://www.ndtv.com/,https://pbs.twimg.com/profile_images/570440108...,https://pbs.twimg.com/profile_banners/37034483...,478,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,4416,185.243151,0.500098,0.000855,0.0,7.193586,4.125449,True,True,True,True
19,ANI,ANI,355989081,Asian News International. Multi-media news age...,True,2011-08-16 05:23:41,5741178,0,483752,0,5368,247930,India,False,http://www.aninews.in,https://pbs.twimg.com/profile_images/149786429...,https://pbs.twimg.com/profile_banners/35598908...,474,india,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,False,3580,135.088523,0.512514,0.000935,0.0,6.759001,3.729893,True,True,True,True
20,aajtak,AajTak,42606652,"AajTak covers breaking news, latest news in po...",True,2009-05-26 11:31:00,13412978,383,545015,1467,4203,173117,India,False,http://www.aajtak.in,https://pbs.twimg.com/profile_images/139525407...,https://pbs.twimg.com/profile_banners/42606652...,461,india,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,4392,124.064421,0.317637,0.000313,0.002692,7.127525,3.623663,True,True,True,True
38,timesofindia,The Times Of India,134758540,News. Views. Analysis. Conversations. India’s ...,True,2010-04-19 10:50:15,13671575,470,665942,5,11987,306959,New Delhi,False,http://www.timesofindia.com,https://pbs.twimg.com/profile_images/112966666...,https://pbs.twimg.com/profile_banners/13475854...,327,india,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,4064,163.82337,0.460939,0.000877,8e-06,7.135819,4.078747,True,True,True,True
40,TimesNow,TIMES NOW,240649814,TIMES NOW is India’s most watched English news...,True,2011-01-20 12:17:23,10058892,377,674164,4,5451,361333,India,False,http://www.timesnownews.com,https://pbs.twimg.com/profile_images/135407453...,https://pbs.twimg.com/profile_banners/24064981...,315,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,3788,177.92663,0.535971,0.000542,6e-06,7.00255,3.736556,True,True,True,True
41,IndiaToday,IndiaToday,19897138,"Brings you news breaks: Exclusive political, e...",True,2009-02-02 07:21:54,5700981,248,879408,4445,6045,256677,República da Índia,False,,https://pbs.twimg.com/profile_images/139525367...,https://pbs.twimg.com/profile_banners/19897138...,304,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,4505,195.163782,0.291874,0.00106,0.005055,6.75595,3.781468,True,False,True,True
49,ABPNews,ABP News,39240673,Follow for latest news alerts from India,True,2009-05-11 12:25:51,11397673,43,319015,136,4372,86783,India,False,http://abplive.com,https://pbs.twimg.com/profile_images/134032467...,https://pbs.twimg.com/profile_banners/39240673...,242,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,4407,72.371824,0.272033,0.000384,0.000426,7.056816,3.640779,True,True,True,True
61,htTweets,Hindustan Times,36327407,One of India's largest media companies. Latest...,True,2009-04-29 10:11:34,8077505,132,898966,2929,7501,355028,India,False,http://www.hindustantimes.com,https://pbs.twimg.com/profile_images/130014533...,https://pbs.twimg.com/profile_banners/36327407...,200,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,False,4419,203.385973,0.394929,0.000929,0.003258,6.907277,3.875177,True,True,True,True
71,DDNewslive,DD News,1100927498,"Official Twitter account of DD News, the Publi...",True,2013-01-18 13:04:45,3233524,82,192877,2668,2953,83695,"New Delhi, India",False,http://www.ddnews.gov.in,https://pbs.twimg.com/profile_images/109603088...,https://pbs.twimg.com/profile_banners/11009274...,187,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,3059,63.031699,0.433927,0.000913,0.013833,6.509676,3.47041,True,True,True,True
122,IndianExpress,The Indian Express,38647512,The Indian Express brings to you latest news f...,True,2009-05-08 11:29:15,3920126,314,790746,8,5624,123990,India,False,https://indianexpress.com/,https://pbs.twimg.com/profile_images/106459960...,https://pbs.twimg.com/profile_banners/38647512...,111,india,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,4410,179.266833,0.156801,0.001435,1e-05,6.5933,3.750123,True,True,True,True
