In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re


In [None]:
from extract_dataframe import read_json         # a function to load json_data 
from extract_dataframe import TweetDfExtractor  # and a class to extract relevant variables.    
from clean_tweets_dataframe import Clean_Tweets        # collection of functions to for cleaning the dataframe.
#import dashboard_manager as ds_m   # imports a list of functions to create a dashboard
import database_management as db_m  # a list of functions to create a database, create tables, and manage data input-output.


Load the JSON data

In [None]:
# Parse the raw tweet dump; only the second value returned by read_json is used here.
_, tweet_list = read_json("data/world_twitter_data.json")
# Wrap the parsed tweets in the extractor and build the dataframe.
tweet = TweetDfExtractor(tweet_list)
tweet.get_tweet_df(True)  # this will also generate a CSV file.
# NOTE(review): presumably the generated file is 'processed_tweet_data.csv', which the
# next code cell reads - confirm in extract_dataframe.
print("Please, load the CSV file")

Load the CSV

In [None]:
tweet_df = pd.read_csv('processed_tweet_data.csv')  # load the CSV file written by the extraction cell above.


In [None]:
# (rows, columns) of the frame as loaded from the CSV, before any cleaning.
tweet_df.shape

In [None]:
# Run the cleaning steps in order; each step receives the frame returned by the
# previous one.
cleaner = Clean_Tweets(tweet_df)
tweet_df = (
    tweet_df
    .pipe(cleaner.drop_unwanted_column)
    .pipe(cleaner.drop_duplicate)
    .pipe(cleaner.convert_to_datetime)
    .pipe(cleaner.convert_to_numbers)
)
# The language-filtered result gets its own name; tweet_df keeps the
# pre-filter frame.
clean_tweet_df = cleaner.remove_non_english_tweets(tweet_df)


In [None]:
# (rows, columns) remaining after the cleaning steps in the previous cell.
clean_tweet_df.shape

In [None]:
# Summary statistics (pandas describe) of the cleaned frame.
clean_tweet_df.describe()

In [None]:
# Keep only the variables needed for the analysis below and renumber the rows
# 0..n-1. reset_index(drop=True) discards the old index directly, so there is
# no temporary 'index' column to create and delete afterwards.
sho_tweet_df = (
    clean_tweet_df
    .drop(columns=['original_text', 'user_mentions', 'possibly_sensitive'])
    .reset_index(drop=True)
)

# Complete-case subset: any row with a null in one of the remaining columns is
# removed, then the rows are renumbered as well.
loc_tweet_df = sho_tweet_df.dropna().reset_index(drop=True)

# Row/column counts at each stage, to show how much each step removed.
print(clean_tweet_df.shape)
print(sho_tweet_df.shape)
print(loc_tweet_df.shape)

In [None]:
def sentiment_group(p):
    """Map a polarity score to a sentiment label.

    Returns 'positive' when p > 0, 'negative' when p < 0 and 'neutral'
    otherwise. The fallback covers 0 as well as any value for which both
    comparisons are false (for example NaN).
    """
    if p > 0:
        return 'positive'
    if p < 0:
        return 'negative'
    return 'neutral'

In [None]:
# One sentiment label per row of loc_tweet_df, in row order, as a fresh
# 0..n-1 indexed Series.
sentiment = pd.Series(list(map(sentiment_group, loc_tweet_df['polarity'])))


In [None]:
# Label each frame from its OWN polarity column.
# The `sentiment` Series is computed from loc_tweet_df (rows with nulls already
# dropped), so attaching it positionally to sho_tweet_df (which still has those
# rows) pairs tweets with the labels of different tweets and leaves the tail
# unlabeled. Mapping per frame keeps every label on the row it was derived from.
# Rows whose polarity is missing fall into the 'neutral' branch of sentiment_group.
# assign() also makes the cell safe to re-run: it overwrites the column instead
# of appending a second 'sentiment' column.
sho_tweet_df = sho_tweet_df.assign(sentiment=sho_tweet_df['polarity'].map(sentiment_group))
loc_tweet_df = loc_tweet_df.assign(sentiment=loc_tweet_df['polarity'].map(sentiment_group))

sho_tweet_df.info()

In [None]:
# Column dtypes and non-null counts of the null-free frame.
loc_tweet_df.info()


# Inserting into the Database


In [None]:
# create the database if it doesn't exist
db_m.createDB(dbName='tweets')

# create the tables if they don't exist
db_m.createTables(dbName='tweets')

# Insert the cleaned, null-free tweets.
# The helper is reached through the db_m module alias because it is never
# imported by name, and the frame is loc_tweet_df (no `loc_clean_df` is defined
# anywhere in this notebook).
# NOTE(review): assumes insert_to_tweet_table is defined in database_management - confirm.
db_m.insert_to_tweet_table(dbName='tweets', df=loc_tweet_df, table_name='TweetInformation')

In [None]:
# Pie chart of the five most frequent tweet sources; the trailing ';' hides
# the Axes repr in the cell output.
top_source_counts = loc_tweet_df["source"].value_counts().head(5)
top_source_counts.plot.pie();

In [None]:
# loading countries basic information module and extracting african countries
from countries_info import countries

# Names and capitals of every African country. Entries with no value
# (None / empty string) are skipped so they can never match a location.
african_countries = [
    place
    for item in countries
    if item['continent'] == 'Africa'
    for place in (item['name'], item['capital'])
    if place
]

# Tweets per African place.
# The counts are taken from loc_tweet_df, the same frame that supplies the
# percentage denominator and the filtered afr_tweets_df below, so the three
# results describe one population.
is_african = loc_tweet_df['location'].isin(african_countries)
afr_counts = loc_tweet_df.loc[is_african, 'location'].value_counts()  # already sorted, largest first
ava_countries = afr_counts.to_dict()  # place -> tweet count

print("\ntweet_count summary by african countries\n")
# the count of tweets in association with each african place.
afr_df = pd.DataFrame({'places': afr_counts.index, 'tweet_count': afr_counts.values})
afr_df['percentage'] = ((afr_df['tweet_count'] / loc_tweet_df.shape[0]) * 100).round(1)
print("\n", afr_df, "\n")

# the tweet dataframe filtered by african places
afr_list = list(ava_countries.keys())
afr_tweets_df = loc_tweet_df[is_african]

print("tweet count summary by top 10 global countries\n")

# Ten most frequent locations overall, with their share of all rows in loc_tweet_df.
top_loc = (
    loc_tweet_df
    .groupby(['location'])
    .size()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
    .head(10)
)
top_loc.columns = ['places', 'tweet_count']
top_loc['percentage'] = ((top_loc['tweet_count'] / loc_tweet_df.shape[0]) * 100).round(1)

print(top_loc)

In [None]:
# Bar chart of tweet counts for the top locations.
labels = top_loc['places'].tolist()
values = top_loc['tweet_count'].tolist()

# Explicit figure/axes pair instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(labels, values, color='purple', width=0.4)

ax.set_xlabel("Locations")
ax.set_ylabel("Tweet Count")
ax.set_title("Tweets associated with locations")
plt.show()

In [None]:
# Tweet count for every (location, sentiment) pair among the African tweets.
gr_afr = (
    afr_tweets_df
    .groupby(['location', 'sentiment'])
    .size()
    .reset_index(name='count')
)
print(gr_afr)

In [None]:
# Restrict the tweets to the top locations found earlier.
top_loc_list = top_loc['places'].tolist()
in_top_location = loc_tweet_df['location'].isin(top_loc_list)
top_loc_tweets_df = loc_tweet_df[in_top_location]

# Tweet count for every (location, sentiment) pair among those tweets.
gr_all = (
    top_loc_tweets_df
    .groupby(['location', 'sentiment'])
    .size()
    .reset_index(name='count')
)
print(gr_all)

In [None]:
# Number of tweets carrying each sentiment label.
positive = int((loc_tweet_df['sentiment'] == "positive").sum())
negative = int((loc_tweet_df['sentiment'] == "negative").sum())
neutral = int((loc_tweet_df['sentiment'] == "neutral").sum())

# labels and values must be listed in the SAME order, otherwise a bar is drawn
# under the wrong name (the neutral and negative counts were swapped before).
labels = ['neutral', 'positive', 'negative']
values = [neutral, positive, negative]

fig = plt.figure(figsize = (15, 5))

# creating the bar plot
plt.bar(labels, values, color ='orange',
        width = 0.4)

plt.xlabel("Sentiment")
plt.ylabel("Tweet Count")
plt.title("Tweets associated with Sentiment")
plt.show()

In [None]:
# Number of non-null clean_text entries for each sentiment label.
loc_tweet_df.groupby('sentiment')['clean_text'].count()


Sentiment analysis

In [None]:
# Row count of sho_tweet_df before the neutral tweets are removed in the next cell.
len(sho_tweet_df['polarity'])


In [None]:
# Keep every row whose sentiment is not 'neutral' and renumber the rows 0..n-1,
# leaving the two classes used for the binary model below.
not_neutral = sho_tweet_df['sentiment'] != 'neutral'
sho_tweet_df = sho_tweet_df[not_neutral].reset_index(drop=True)
sho_tweet_df.shape

In [None]:
# Binary target, one entry per row of sho_tweet_df: 1 for 'positive', 0 for anything else.
valueArray = pd.Series([int(label == 'positive') for label in sho_tweet_df['sentiment']])


In [None]:
# Attach the binary target as a column, then show the resulting shape
# (`.shape`; the attribute was previously misspelled and raised AttributeError).
sho_tweet_df['valueArray'] = valueArray
sho_tweet_df.shape

In [None]:
# Model input: the cleaned tweet text. Target: the 0/1 sentiment flag.
X = sho_tweet_df['clean_text']
y = sho_tweet_df['valueArray']  # closing bracket restored; the cell was a SyntaxError

In [None]:
# Hold out 25% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [None]:
# Element count of each split, printed in the order
# X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.size)

In [None]:
# Bag-of-words features for the training text.
count_vect = CountVectorizer()
# Missing text becomes an empty string so the vectorizer receives only strings.
X_train = X_train.replace(np.nan, '', regex=True)
# Learn the vocabulary on the training split and densify the count matrix.
X_train_counts = count_vect.fit_transform(X_train).toarray()

In [None]:
# Replace any missing training label with 0 before fitting.
y_train = y_train.fillna(0)

In [None]:

# Standardize the count features, then fit a linear classifier with SGD.
# random_state pins the shuffling SGD performs between epochs, so repeated
# runs train the same model (42 matches the seed used for the train/test split).
# The label fillna that used to open this cell was a verbatim repeat of the
# previous cell and has been removed.
clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=9000, tol=1e-2, random_state=42),
)

In [None]:
# Fit the scaler + SGD classifier pipeline on the dense training counts.
clf.fit(X_train_counts, y_train)


In [None]:
# Missing text becomes an empty string, as was done for the training split.
X_test = X_test.replace(np.nan, '', regex=True)
# transform (not fit_transform): reuse the vocabulary learned on the training
# data so the test columns line up with the training columns.
X_test_counts = count_vect.transform(X_test).toarray()
prediction = clf.predict(X_test_counts)

In [None]:
# NOTE(review): this recomputes `prediction` from the same fitted model and
# inputs as the last line of the previous cell; the cell looks redundant.
prediction = clf.predict(X_test_counts)


In [None]:
# Accuracy: share of held-out tweets whose predicted label equals the true label.
is_correct = prediction == y_test
np.mean(is_correct)
