# Import Python packages needed for this pilot proposal

In [None]:
# General:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pickle


### The following credentials from my personal twitter developer account, with tag dingkaihua. Twitter server changes token often. If interested, the twitter developer application link is here developers https://developer.twitter.com/en/apply-for-access.

In [None]:
# Twitter App access keys for @dingkaihua

# Consume:
CONSUMER_KEY    = 'XXX'
CONSUMER_SECRET = 'XXX'

# Access:
ACCESS_TOKEN  = 'XXX'
ACCESS_SECRET = 'XXX'


In [None]:
#Authenticate
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth)

# The "credentials" packages makes access more convenient

In [None]:
!pip install credentials



In [None]:
# Import our access keys:
from credentials import *    # This will allow us to use the keys as variables

# API's setup:
def twitter_setup():
    """
    Utility function to setup the Twitter's API
    with our access keys provided.
    """
    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # Return API with authentication:
    api = tweepy.API(auth)
    return api


In [None]:
!pip install --upgrade git+https://github.com/tweepy/tweepy@master

Collecting git+https://github.com/tweepy/tweepy@master
  Cloning https://github.com/tweepy/tweepy (to revision master) to /tmp/pip-req-build-sg84uqi4
  Running command git clone -q https://github.com/tweepy/tweepy /tmp/pip-req-build-sg84uqi4


In [None]:
#search_words ='**KEYWORD/HASHTAG/USERNAME**'
search_words ='Tanganyika'
date_since = "01-01-2020"
date_since_pro = "202008130000"
numTweets = 3000

# standart search
#tweets = tweepy.Cursor(api.search_tweets, q=search_words, since=date_since).items(numTweets)

# premium search
#tweets=tweepy.Cursor(api.search_full_archive,environment_name='**ENV NAME FROM API**', query=search_words, fromDate=date_since_pro).items(numTweets)


In [None]:
results_tanganyika = api.search_tweets(q=search_words,
                  count=numTweets,
                  since_id=date_since)

Save tweet search results

In [None]:

with open('/content/drive/MyDrive/Colab_Notebooks/Baris_Ata/tweet_search.pkl', 'wb') as f:
  pickle.dump(results_tanganyika, f)

#Process tweets and write out a data frame

Turn search results into json

In [None]:
# prepare a list of tweet_json
tweet_json = []

for tweet_search in results_tanganyika:
  tweet_json.append(tweet_search._json)


In [None]:
from textblob import TextBlob

In [None]:
!pip install emoji



In [None]:
import pandas as pd
import re
import emoji
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         if w.lower() in words or not w.isalpha())
    return tweet

In [None]:
import textblob.exceptions


usr_name = []
usr_location = []
tweet_text = []
tweet_text_en = []
tweet_geo =[]
tweet_place = []
lang = []
tweet_time=[]

for tweet in tweet_json:
  usr_name.append(tweet['user']['name'])
  usr_location.append(tweet['user']['location'])
  tweet_text.append(cleaner(tweet['text']))
  lang.append(tweet['lang'])
  if (tweet['lang'] == 'en') or  (tweet['lang'] == 'eu') or (tweet['lang'] == 'in'):
    tweet_text_en.append(tweet['text'])
  else:  
    blob = TextBlob(cleaner(tweet['text']))
    try:
      tweet_text_en.append(blob.translate(to='en'))
    except textblob.exceptions.NotTranslated:
      tweet_text_en.append(tweet['text'])
  
  tweet_geo.append(tweet['geo'])
  tweet_time.append(tweet['created_at'])
  

In [None]:
# create data frame for the ease of viewing things

tweet_df = pd.DataFrame()
tweet_df['User Name'] = usr_name
tweet_df['User Location'] = usr_location
tweet_df['Text'] = tweet_text
tweet_df['English Text'] = tweet_text_en

tweet_df['Language'] = lang
tweet_df['Time'] = tweet_time
tweet_df['geo'] = tweet_geo

In [None]:
with open('/content/drive/MyDrive/Colab_Notebooks/Baris_Ata/processed_tweets.pkl', 'wb') as f:
  pickle.dump(tweet_df, f)




In [None]:
len(tweet_json[0].keys())

25

# Named entitiy recognition

In [None]:
import pickle
import re

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls /content/drive/MyDrive/Colab_Notebooks/Baris_Ata/processed_tweets.pkl

/content/drive/MyDrive/Colab_Notebooks/Baris_Ata/processed_tweets.pkl


In [None]:
with open('/content/drive/MyDrive/Colab_Notebooks/Baris_Ata/processed_tweets.pkl', 'rb') as f:
  tweet_df = pickle.load(f)

In [None]:
tweet_df.head(10)

Unnamed: 0,User Name,User Location,Text,English Text,Language,Time,geo
0,Lumumba puVision 1,,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:19:20 +0000 2021,
1,Tresor M Vutungire,République Démocratique Du Con,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:19:04 +0000 2021,
2,Rogo,,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:18:17 +0000 2021,
3,Congo Nouveau,République Démocratique Du Con,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:17:08 +0000 2021,
4,ALLIANCE Humanitaire ONG,,: 25 . 08 . 2021 : de remise bureaux de l ' de...,"(:, , 2, 5, ., , 0, 8, ., , 2, 0, 2, 1, :, ...",fr,Wed Aug 25 20:16:57 +0000 2021,
5,angekapepula,,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:16:45 +0000 2021,
6,Sagesse Pratique,,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:15:53 +0000 2021,
7,Kitengie Kwibwe,,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:15:26 +0000 2021,
8,αкα ♛ 𝙰𝙻𝙴𝚇𝙰𝙽𝙳𝚁𝙴,République Démocratique Du Congo,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:15:25 +0000 2021,
9,Caleb faray,République Démocratique Du Con,: 🇨🇩 L ’ de de a net par la de . …,"(:, , L, , ’, d, e, , d, e, , a, , n, e, ...",fr,Wed Aug 25 20:14:51 +0000 2021,


In [None]:
# extract tweet sentences
tweet_list = tweet_df['English Text'].values
raw_tweets_en = []
for tweet in tweet_list:
  raw_tweets_en.append(str(tweet)) # convert all text blob object into text

  


In [None]:
languages = tweet_df['Language'].values
dict_lang = {}

for language in languages:
  if (language in dict_lang.keys()):
    dict_lang[language] = dict_lang[language] +1 
  else:
    dict_lang[language] = 1 


In [None]:
dict_lang

{'en': 35, 'fr': 46, 'in': 14, 'pl': 1, 'pt': 1, 'tl': 1, 'und': 2}

In [None]:
locations = tweet_df['User Location'].values
type(locations[0])

str

In [None]:
locations = tweet_df['User Location'].values
dict_location = {}

for location in locations:
  
  if (location in dict_location.keys()):
    dict_location[location] = dict_location[location] +1 
  else:
    dict_location[location] = 1 

In [None]:
dict_location

{'': 36,
 'Accraaaaaa!': 1,
 'Africa': 2,
 'Arusha, Tanzania': 2,
 'Attiki, Greece': 1,
 'Brooklyn, NY': 1,
 'Bruxelles, Belgique': 1,
 'Bujumbura': 1,
 'Bujumbura  Burundi ': 1,
 'Burundi': 1,
 'Dar es Salaam, Tanzania': 8,
 'Dodoma,Tanzania.': 1,
 'France': 1,
 'From Earth to Mars': 1,
 'Gihosha,Bujumbura-Burundi': 1,
 'Ivory Coast': 1,
 'K I N S H A S A': 1,
 "Kw'isi y'uBurundi ": 1,
 'London, England': 1,
 'Lubumbashi': 1,
 'Mara, Tanzania': 2,
 'Mwanza, Tanzania': 1,
 'Nairobi, Kenya': 1,
 'New Delhi': 1,
 'New York, USA': 1,
 'Plot 100/654 Ibex Hill, Lusaka': 1,
 'Plus Beau Pays du monde🇨🇩❤👌': 1,
 'Pool, Congo': 1,
 'RD CONGO🇨🇩': 1,
 'RDC': 1,
 'RDC/France/Luxembourg': 1,
 'RDC/Goma': 1,
 'Rd Congo, Lubumbashi': 1,
 'Rep. Dém du Congo': 1,
 'République Démocratique Du Con': 5,
 'République Démocratique Du Congo': 1,
 'République du Rwanda': 1,
 'Salta, Argentina': 1,
 'Tanzania': 7,
 'Zanzibar West, Tanzania': 3,
 'earth. ': 1,
 'mathetopetonia': 1,
 'भारत': 1}

In [None]:
tweet_df[tweet_df['Language']== 'pl']

Unnamed: 0,User Name,User Location,Text,English Text,Language,Time,geo
29,ɱTαɳɠαɳყιƙα❄,"Zanzibar West, Tanzania",ni,@rollymsouth hapana ni muuza ukwaju,pl,Wed Aug 25 19:02:10 +0000 2021,


In [None]:
stopwords_hand_generated = [ "\n", "RT"]

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)


#tweets_sw_removal = []
def stopwords_removal_naive(raw_tweets_en):
  tweets_sw_removal = []
  for sentence in raw_tweets_en:
    #print('before: ' + sentence)
    for word in stopwords_hand_generated:
      token = " " + word + " "
      sentence = sentence.replace(token, " ")
      sentence = sentence.replace("  ", " ")
      sentence = re.sub('@[^\s]+','',sentence) #remove user handle
      sentence = re.sub('@ [^\s]:+','',sentence)
      sentence = re.sub(r"http\S+", "", sentence) # remove url
      
      
      sentence = sentence.replace("#", " ")
      sentence = sentence.replace("\n", " ")
      sentence = sentence.replace("LePhoenix84", "")
      sentence = sentence.replace("@", "")
      
      
      sentence = re.sub('RT+','',sentence) #remove RT
      sentence = deEmojify(sentence) #remove emoji
      
    tweets_sw_removal.append(sentence)
  #print('after:  ' + sentence)
  return tweets_sw_removal

In [None]:
tweets_sw_removal = stopwords_removal_naive(raw_tweets_en)
len(tweets_sw_removal)

100

#Named entity recognition

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
from nltk import ne_chunk
nltk.download('maxent_ne_chunker')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [None]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [None]:
test1=preprocess(tweets_sw_removal[0])

In [None]:
# identify issue through pattern
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [None]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(test1)
print(cs)

NameError: ignored

In [None]:
ne_tree = ne_chunk(pos_tag(word_tokenize(tweets_sw_removal[0])))
print(ne_tree)

#SpaCy named entity recognition

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
doc = nlp(tweets_sw_removal[0])
pprint([(X.text, X.label_) for X in doc.ents])

[('L ’de de a net par', 'ORG')]


In [None]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(:, 'O', ''),
 (L, 'B', 'ORG'),
 (’, 'I', 'ORG'),
 (de, 'I', 'ORG'),
 (de, 'I', 'ORG'),
 (a, 'I', 'ORG'),
 (net, 'I', 'ORG'),
 (par, 'I', 'ORG'),
 (la, 'O', ''),
 (de, 'O', ''),
 (., 'O', ''),
 (..., 'O', '')]


# A test wtih beautiful soup

In [None]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

In [None]:
labels = [x.label_ for x in article.ents]
Counter(labels)

In [None]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

In [None]:
for tweet in tweets_sw_removal:
  print(tweet)

: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: 25. 08. 2021: handover of the offices of the…
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
: L ’de de a net par la de. ...
 L ’de a net by la de…
  Feeling Proud.  My ancestors came from diverse backgrounds. Part of my Mum’s family is from Tanzania or Tangany…
n ', manipulation. ...
 Zinauzwa pale Tanganyika arms. Unaweza kununua yako
This pose, this 200. [...] In 1996 , …
  BibiTiti was an influential leader during Tanzania's quest for independence under Tanganyika African National Union ( …
: we : . sound, there is on l. ...
 MAJALIWA: NIMERIDHISHWA NA UJENZI WA BANDARI YA KAREMA  Waziri Mkuu leo Agosti 25, 2021 amekagua maende…
 🤣🤣
: we : . sound, 

# spaCY relationship visualizer

In [None]:
displacy.render(nlp(tweets_sw_removal[-9]), style='dep', jupyter = True, options = {'distance': 120})
displacy.render(nlp(tweets_sw_removal[80]), style='dep', jupyter = True, options = {'distance': 120})

# spaCY

In [None]:
from pathlib import Path
from spacy import displacy

nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)

sentence_nlp = nlp(tweets_sw_removal[-9])
svg = displacy.render(sentence_nlp, style="dep")

output_path = Path("/content/dependency_plot.svg") # you can keep there only "dependency_plot.svg" if you want to save it in the same folder where you run the script 
output_path.open("w", encoding="utf-8").write(svg)

#svg = displacy.render(nlp(tweets_sw_removal[-9]), style='dep', jupyter = True, options = {'distance': 120})
#output_path = Path("/content/dependency_plot.svg")
#output_path.open("w", encoding="utf-8").write(svg)

15834

In [None]:
displacy.render(nlp(tweets_sw_removal[-9]), jupyter=True, style='ent')
#displacy.render(nlp(tweets_sw_removal[80]), jupyter=True, style='ent')

In [None]:
tweets_sw_removal[-9]

' The floodings of Lake Tanganyika and the Rusizi river have displaced thousands of people in Burundi. Urgent resources are…'

In [None]:
#article = ''

# articles

#for index in range(len(tweets_sw_removal)):
#  article = article.join(tweets_sw_removal[index])

#article 

In [None]:
# construct a list of nlp
nlp_list = []
for tweet in tweets_sw_removal:
  nlp_list.append(nlp(tweet))


In [None]:
# Counter 
items = []
labels = []
dict_nlp = {}
dict_entity = {}
for nlp_l in nlp_list:
  for item in nlp_l.ents:
    items.append(item.text)
    dict_nlp[str(item)] = item.label_
    labels.append(item.label_)

    if item.label_ in dict_entity.keys():
      temp_list = dict_entity[item.label_]
      temp_list.append(item.text)
      dict_entity[item.label_] = temp_list
    else:
      dict_entity[item.label_] = [item.text]


In [None]:
dict_entity['GPE']

['Tanzania', 'Tangany', 'Tanzania', 'Congo', 'Shungwaya', 'Tanzania', 'Uganda', 'Tanzania', 'Tanzania', 'Tanzania', 'Tanganyika', 'Tanzania', 'Zambia', 'Malawi', 'Mozambique', 'Lakes Mal', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tangany', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tanganyika', 'Bantou', 'Rusizi', 'Burundi', 'London', 'UK', 'Rusizi', 'Burundi', 'Burundi', 'Burundi', 'Burundi', 'Kas', 'the Republic of Burundi', 'Rusizi', 'Burundi', 'VisitEastAfrica', 'Rusizi', 'Burundi', 'VisitEastAfrica', 'Rusizi', 'Burundi']

In [None]:
dict_entity['ORG']

['L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', 'L ’de de a net par', "n '", 'Tanganyika African National Union', 'MAJALIWA', 'NIMERIDHISHWA NA', 'Waziri Mkuu', 'maende', 'hapana ni', 'MAJALIWA', 'NIMERIDHISHWA NA', 'Waziri Mkuu', 'maende', 'Tanganyika African National Union', 'Tanganyika African National Union', 'NIMERIDHISHWA NA', 'Waziri Mkuu', 'Tanganyika African National Union', 'na', 'irudisha nchi', 'Tanganyika African National Un', 'Wildlife', "de l '", "office s '", 'Thanks to Fund', 'the office sa', 'SAMBA']

In [None]:
dict_entity['GPE'] # GPE is geoplotical entity

['Tanzania', 'Tangany', 'Tanzania', 'Congo', 'Shungwaya', 'Tanzania', 'Uganda', 'Tanzania', 'Tanzania', 'Tanzania', 'Tanganyika', 'Tanzania', 'Zambia', 'Malawi', 'Mozambique', 'Lakes Mal', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tangany', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tanganyika', 'Tanzania', 'Tanganyika', 'Bantou', 'Rusizi', 'Burundi', 'London', 'UK', 'Rusizi', 'Burundi', 'Burundi', 'Burundi', 'Burundi', 'Kas', 'the Republic of Burundi', 'Rusizi', 'Burundi', 'VisitEastAfrica', 'Rusizi', 'Burundi', 'VisitEastAfrica', 'Rusizi', 'Burundi']

In [None]:
dict_nitem.label_

'ORG'

In [None]:
most_common_recognition = Counter(items).most_common(20)
for recognized in most_common_recognition:
  print(recognized)

('L ’de de a net par', 13)
('Tanzania', 12)
('Lake Tanganyika', 12)
('Burundi', 8)
('Tanganyika', 7)
('4', 7)
('Rusizi', 5)
('thousands', 5)
('Tanganyika African National Union', 4)
('Tan', 4)
('25', 3)
('08', 3)
('NIMERIDHISHWA NA', 3)
('BANDARI YA KAREMA', 3)
('Waziri Mkuu', 3)
('Agosti 25', 3)
('1', 3)
('Mum', 2)
('Tangany', 2)
('MAJALIWA', 2)


In [None]:
dict_nlp['Tanganyika African National Union']

'ORG'

In [None]:

Counter(labels)

Counter({'GPE': 46, 'ORG': 38, 'PERSON': 32, 'CARDINAL': 28, 'FAC': 11, 'DATE': 7, 'LOC': 6, 'PRODUCT': 5, 'NORP': 2, 'ORDINAL': 1, 'WORK_OF_ART': 1})

In [None]:
dict_entity['GPE']

['Burundi']

In [None]:
nlp(tweets_sw_removal[3])
displacy.render(nlp(tweets_sw_removal[3]), jupyter=True, style='ent')

In [None]:
sentences = [x for x in article.sents]
print(sentences[20])

In [None]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [None]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])

In [None]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [None]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[20]])

In [None]:
displacy.render(nlp(str(article)), jupyter=True, style='ent')

In [None]:
tweets_sw_removal = stopwords_removal_naive(raw_tweets_en)

In [None]:
tweets_sw_removal = stopwords_removal_naive(raw_tweets_en)