In [1]:
!pip install gensim==3.8.3



In [2]:
!pip install wikipedia



In [3]:
import sys
import os
import itertools as it
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn import decomposition as skd
from sklearn import feature_extraction as skfe
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
import wikipedia
import difflib
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

wikipedia.set_lang('en')



In [4]:
#import data

src_dir = r'C:\Users\elizabeth\Documents\cleaned tweets\cyber friends tweets'

files = [file for file in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, file))] # build list of files to iterate through

data = pd.DataFrame()

with tqdm(total=len(files), desc='Files') as pbar:
    for file in files:
        df = pd.read_csv(os.path.join(src_dir, file), low_memory=False)
        data = pd.concat([data, df[['tweet_id', 'text', 'hashtags']]], axis=0)
        pbar.update(1)
        
data = data[data['hashtags'].astype(str) != '[]'] # remove empties
data['text'] = data['text'].apply(lambda x: x.lower() if isinstance(x, str) else None)
data

Files: 100%|███████████████████████████████████████████████████████████████████████████| 40/40 [01:48<00:00,  2.70s/it]


Unnamed: 0,tweet_id,text,hashtags
1,1293934001514848256,rt breaking via fbi to join beirut blast probe...,"['breaking', 'fbi', 'beirut', 'probe']"
2,1293580602055299072,rt breaking lebanon prosecutor to question sev...,"['breaking', 'beirutblast']"
6,1292021027543953408,rt according to the lebanese health ministry o...,['beirutblast']
9,1291401752911257606,rt breaking imf urges lebanon to break reform ...,"['breaking', 'lebanon']"
12,1291331749956509698,macron says will pitch new political deal to l...,['lebanon']
...,...,...,...
87365,1270406159355334656,rt another fascinating panel this afternoon th...,"['trust', 'technology']"
87379,1268578990220029953,rt join this cogx panel to hear the latest exp...,['cogx2020']
87437,1260516465435906050,rt join us today bst to discuss whether a join...,"['covid19uk', 'contacttracing']"
87439,1260119164502659072,rt could ai get us out of lockdown a group of ...,['ai']


#### Here are some group hashtag topics

In [None]:
topics = [['covid19', 'lockdown', 'coronavirus', 'mentalhealth', 'healthcare'], 
['ai', 'ml', 'dtascience', 'artificalintelligence'], 
['cybersecurity', 'iot', 'robotics', 'infosec'],
['fintech'],
['blockchain', 'payments', 'cryptocurrency', 'bitcoin', 'cyrpto'],
['innovation', 'tech'],
['data', 'gdpr'],
['brexit'],
['privacy'],
['startup company'],
['china'],
['5g'],
['digitaltransformation'],
['beirut', 'lebanon'],
['belarus'],
['journalism', 'travel'], 
['apple'],
['ar', 'vr'], 
['property', 'realestate'], 
['architecture', 'design'],
['climatechange', 'esg', 'sustainability'],
['cybercrime', 'ransomeware', 'malware'],
['facebook', 'socialmedia', 'twitter']]

#### assign topics to tweets

In [None]:
def check_for_matches(tag_list, topic_list):
    '''
    
    '''
    matches = []
    
    # Compare list of hashtags in tweet against those in each topic
    potential_matches = [bool(set(tag_list).intersection(set(topic))) for topic in topic_list]
    if any(potential_matches):
        matches.extend([i for i in range(len(potential_matches)) if potential_matches[i]])
    else:
        matches.append(-1)
    return matches

possible_topics = data['hashtags'].apply(lambda x: check_for_matches(eval(x), topics))
possible_topics

In [None]:
labelled_data = data.assign(topic_labels=possible_topics.values)
labelled_data['main_label'] = labelled_data['topic_labels'].apply(lambda x: x[0])
labelled_data

In [None]:
labelled_data['main_label'].value_counts().plot.barh()
fig = plt.gcf()
fig.set_size_inches(10,15)
plt.xscale('log')
plt.show()

In [None]:
value_counts = labelled_data['main_label'].value_counts()
value_counts


In [5]:
topics = [['covid19', 'lockdown', 'coronavirus', 'mentalhealth', 'healthcare'], 
['ai', 'ml', 'dtascience', 'artificalintelligence'], 
['cybersecurity', 'iot', 'robotics', 'infosec'],
['fintech', 'blockchain', 'payments', 'cryptocurrency', 'bitcoin', 'cyrpto']]

In [6]:
def check_for_matches(tag_list, topic_list):
    '''
    
    '''
    matches = []
    
    # Compare list of hashtags in tweet against those in each topic
    potential_matches = [bool(set(tag_list).intersection(set(topic))) for topic in topics]
    if any(potential_matches):
        matches.extend([i for i in range(len(potential_matches)) if potential_matches[i]])
    else:
        matches.append(4)
    return matches

possible_topics = data['hashtags'].apply(lambda x: check_for_matches(eval(x), topics))

labelled_data = data.assign(topic_labels=possible_topics.values)
labelled_data['main_label'] = labelled_data['topic_labels'].apply(lambda x: x[0])
labelled_data

Unnamed: 0,tweet_id,text,hashtags,topic_labels,main_label
1,1293934001514848256,rt breaking via fbi to join beirut blast probe...,"['breaking', 'fbi', 'beirut', 'probe']",[4],4
2,1293580602055299072,rt breaking lebanon prosecutor to question sev...,"['breaking', 'beirutblast']",[4],4
6,1292021027543953408,rt according to the lebanese health ministry o...,['beirutblast'],[4],4
9,1291401752911257606,rt breaking imf urges lebanon to break reform ...,"['breaking', 'lebanon']",[4],4
12,1291331749956509698,macron says will pitch new political deal to l...,['lebanon'],[4],4
...,...,...,...,...,...
87365,1270406159355334656,rt another fascinating panel this afternoon th...,"['trust', 'technology']",[4],4
87379,1268578990220029953,rt join this cogx panel to hear the latest exp...,['cogx2020'],[4],4
87437,1260516465435906050,rt join us today bst to discuss whether a join...,"['covid19uk', 'contacttracing']",[4],4
87439,1260119164502659072,rt could ai get us out of lockdown a group of ...,['ai'],[1],1


In [7]:
import random

In [11]:
sample_df = pd.DataFrame()
for label in range(0,5):
    rnd_idx = random.sample(range(labelled_data[labelled_data['main_label']==label].shape[0]), 5000)
    sample_df = pd.concat([sample_df, labelled_data[labelled_data['main_label']==label].iloc[rnd_idx]], axis=0)
sample_df

Unnamed: 0,tweet_id,text,hashtags,topic_labels,main_label
57949,1290799897470525448,over of the world s workers rely on the inform...,"['covid19', 'buildforwardbetter']",[0],0
17074,1238910836514717696,powerful rebuke by of s cowardice insisting th...,['coronavirus'],[0],0
81538,1272540779970469890,hands up who s guilty of junk miles training t...,"['training', 'triathlon', 'swimming', 'cycling...",[0],0
65321,1293178121294938113,rt investing in testing contact tracing and pu...,['covid19'],[0],0
5274,1156488530937466880,if you re feeling the pressure this summer you...,"['prrequest', 'journorequest', 'burnout', 'men...",[0],0
...,...,...,...,...,...
11063,1187711970708996096,rt at the end of a fascinating st asia europe ...,"['asia', 'europe', 'geopolitics', 'connectivity']",[4],4
58547,1286748461258268674,we had so many great submissions for hacktivit...,['hacktivitycon2020'],[4],4
17800,1011627067425873920,h marseille passons la nuit ensemble stonesnof...,['stonesnofilter'],[4],4
16414,1294688845934534656,wonderful wonderful truly wonderful men tomorr...,['tomorrowwillbeagoodday'],[4],4


In [12]:
sample_df['main_label'].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [18]:
grouped = sample_df.groupby('main_label')['text'].agg(' '.join)
grouped


TypeError: sequence item 111: expected str instance, NoneType found

In [13]:
grouped = sample_df.groupby(['main_label'])[['text']].sum()
grouped

Unnamed: 0_level_0,text
main_label,Unnamed: 1_level_1
0,over of the world s workers rely on the inform...
1,microsoft s new app uses ai to help visually i...
2,rt for the fda digital security category we ar...
3,genuinely genuinely fintechrt breaking billion...
4,


In [None]:
!pip install wordcloud

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
def wordcloud_plot(text):
    wordcloud = WordCloud(collocations=False, 
                          width = 1000, 
                          height = 700, 
                          background_color ='white', 
                          min_font_size = 10).generate(text)
    
    plt.figure(figsize = (8,8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

In [None]:
wordcloud_plot(grouped['text'][0])

In [None]:
wordcloud_plot(grouped['text'][1])

In [None]:
wordcloud_plot(grouped['text'][2])

In [None]:
wordcloud_plot(grouped['text'][3])

In [None]:
wordcloud_plot(grouped['text'][4])

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if (len(word) >= min_characters_word) & (word not in stop_words) : tokens.append(word)
#            if (len(word) >= min_characters_word) : tokens.append(word)
    return tokens


# Pre-process training data

Adapted from Martin's code ref:  https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [None]:
min_characters_sent = 3   #Min characters in a paragraph (inclusive)
min_characters_word = 3     #Min characters in a word (inclusive)
test_size = 0.2     #Fraction of corpus to keep back for testing

In [None]:
%time labelled_data['tokens'] = labelled_data['text'].apply(lambda x: tokenize_text(str(x)))

In [None]:
train, test = train_test_split(labelled_data[['text', 'main_label', 'tokens']], test_size=test_size, random_state=42)

train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['tokens'], tags=[str(r.main_label)]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['tokens'], tags=[str(r.main_label)]), axis=1)

# Apply models

# # Distributed bag of words


In [None]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [None]:
#model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores) #Values from tutorial

model_dbow = Doc2Vec(dm=0, vector_size=500, negative=5, hs=1, min_count=2, sample=0, workers=cores) #My optimised values

model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

In [None]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [None]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [None]:
# Distributed Bag of Words (DBOW) model

y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg_dbow = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)
logreg_dbow.fit(X_train, y_train)
y_pred = logreg_dbow.predict(X_test)
print('xgb_model_dbow Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('xgb_model_dbow Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

# Distributed memory


In [None]:
#model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)  #Values from tutorial

model_dmm = Doc2Vec(dm=1, dm_mean=0, vector_size=500, window=10, negative=5, min_count=1, workers=cores, alpha=0.1, min_alpha=0) #My optimised values

model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

In [None]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

In [None]:
# Distributed Memory (DM) model

y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg_dmm = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)
logreg_dmm.fit(X_train, y_train)
y_pred = logreg_dmm.predict(X_test)

print('model_dmm Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('model_dmm Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

# Combined model pairing

In [None]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
model_new = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [None]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [None]:
# Model Pairing method

y_train, X_train = get_vectors(model_new, train_tagged)
y_test, X_test = get_vectors(model_new, test_tagged)
logreg_new = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)
logreg_new.fit(X_train, y_train)
y_pred = logreg_new.predict(X_test)
print('xgb_model_new Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('xgb_model_new Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
#Free up memory (not necessary for our small training sample)
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# Apply to tweets

In [None]:
def get_vectors_apply(model, docs_to_classify):
    sents = docs_to_classify.values
    regressors = [model.infer_vector(doc.words, steps=20) for doc in sents]
    return regressors

In [None]:
def clean_text(text):
    text = re.sub(r"http\S+", "", str(text)) #remove urls
    text = re.sub(r'\S+\.com\S+','',str(text)) #remove urls
    text = re.sub(r'\@\w+','',str(text)) #remove mentions
    text =re.sub(r'\#','',str(text)) #remove hashtags
    text = re.findall(r'[A-Za-z]+',str(text))
    text = ' '.join(text) 
    return text

In [None]:
src_dir = r'C:\Users\elizabeth\Documents\cleaned tweets\cyber friends tweets'

files = [file for file in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, file))]
classifier_model = logreg_dbow
model = model_new

with tqdm(total=len(files), desc='Files') as pbar:
    i=0
    for file in files[:2]:
        df = pd.read_csv(os.path.join(src_dir, file), low_memory=False)
        df = df[df.text != '']
        df['clean_tweet'] = df['text'].apply(lambda x: clean_text(x))
        test_tagged = df.apply(
            lambda r: TaggedDocument(words=tokenize_text(r['clean_tweet']), tags=[r.screen_name]), axis=1)
        X_test = get_vectors_apply(model, test_tagged)
        df['y_pred'] = classifier_model.predict(X_test)
        y_pred_score = classifier_model.predict_proba(X_test)
        df2 = pd.DataFrame(y_pred_score)
        df2.columns=classifier_model.classes_
        df2['score'] = df2.max(axis=1)
        df['score'] = df2['score']
        df3 = df[['tweet_id','screen_name', 'text', 'y_pred', 'score']].copy()
        df4 = pd.concat([df3, df2], axis=1)
        df3.to_csv('tweets_trained_topic_modelled_'+str(i)+'.csv', index=False)
        df4.to_csv('tweets_trained_all_topics_modelled_'+str(i)+'.csv', index=False)
        print(df4.head())
        pbar.update(1)
        i = i+1


# How are journalists interests are spread?

first we need to drop the row where the classification has been less than 0.5, then we can pool over user_name

In [None]:
src_dir = r'C:\Users\elizabeth\Documents\S2DS\tweets_trained_all_topics'

files = [file for file in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, file))]
ls = list(logreg_dbow.classes_)
ls.insert(0, 'screen_name')
df_totalled_topics = pd.DataFrame(columns=ls)

list(df_totalled_topics)

In [None]:
topics = [['covid19', 'lockdown', 'coronavirus', 'mentalhealth', 'healthcare'], 
['ai', 'ml', 'dtascience', 'artificalintelligence'], 
['cybersecurity', 'iot', 'robotics', 'infosec'],
['fintech'],
['blockchain', 'payments', 'cryptocurrency', 'bitcoin', 'cyrpto']]

In [None]:
with tqdm(total=len(files), desc='Files') as pbar:
    for file in files:
        df = pd.read_csv(os.path.join(src_dir, file), low_memory=False)
        df1 = df.groupby('screen_name')['-1', '0', '1', '10',
                                        '11', '12', '13', '14', '15',
                                        '16', '17', '18', '19', '2',
                                        '20', '21', '22', '3', '4',
                                        '5', '6', '7', '8'].sum()
        df_totalled_topics = pd.concat([df_totalled_topics, df1])
        pbar.update(1)
df_totalled_topics.head()


In [None]:
df_totalled_topics['total'] = df_totalled_topics.sum(axis = 1)

In [None]:
df2 = df_totalled_topics[['-1', '0', '1', '10',
                                        '11', '12', '13', '14', '15',
                                        '16', '17', '18', '19', '2',
                                        '20', '21', '22', '3', '4',
                                        '5', '6', '7', '8']].div(df_totalled_topics.total, axis=0)
df2


In [None]:
df2 = df2.reset_index()
df2 = df2.rename(columns={'index': 'user_name'})
df2

In [None]:
df2.to_csv('user_name_topics_summed.csv', index=False)


In [None]:
df3 = df2.set_index('user_name')

In [None]:


x = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
labels = list(df3.columns)
row1 = df3.iloc[0]
row1.plot(kind='bar',title='Oxchich', color='r',stacked=False, figsize=(15,5))





In [None]:
x = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
labels = list(df3.columns)
row1 = df3.iloc[1]
row1.plot(kind='bar',title='_benkatz', color='r',stacked=False, figsize=(15,5))


In [None]:
x = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
labels = list(df3.columns)
row1 = df3.iloc[2]
row1.plot(kind='bar',title='_john_handel', color='r',stacked=False, figsize=(15,5))

In [None]:
x = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
labels = list(df3.columns)
row1 = df3.loc['gcluley']
row1.plot(kind='bar',title='_john_handel', color='r',stacked=False, figsize=(15,5))

# Wordclouds for a sense check 

In [None]:
!pip install wordcloud


In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
def wordcloud_plot(text):
    wordcloud = WordCloud(collocations=False, 
                          width = 1000, 
                          height = 700, 
                          background_color ='white', 
                          min_font_size = 10).generate(text)
    
    plt.figure(figsize = (8,8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

In [None]:
src_dir = r'C:\Users\elizabeth\Documents\S2DS\tweets_trained_topic_modelled'

files = [file for file in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, file))]
with tqdm(total=len(files), desc='Files') as pbar:
    for file in files[:5]:
        df = pd.read_csv(os.path.join(src_dir, file), low_memory=False)
        df1 = df.groupby(['y_pred'])[['text']].sum()
        pbar.update(1)
        



In [None]:
df1.head(3).style.set_properties(subset=['text'], **{'width':'1000px'})

In [None]:
wordcloud_plot(df1['text'][0])


In [None]:
wordcloud_plot(df1['text'][1])

In [None]:
wordcloud_plot(df1['text'][2])

In [None]:
wordcloud_plot(df1['text'][3])