In [1]:
from datetime import datetime, timedelta,timezone
import pandas as pd
import numpy  as np
from sklearn import set_config
import joblib

# from db import Model, Session, engine
# from models import Tweet, ProcessedTweet, Company



from custom_package.text_processing import normalize_text, tokenizer_func, remove_emojis
from custom_package.modeling import GensimLdaTransformer, get_topic_assignment, get_pos_sentiment_proba
from custom_package.modeling import topic_mapping_sk_lda, topic_mapping_gensim_lda, topic_mapping_sk_full_lda
from custom_package.database import get_raw_tweets, store_processed_tweets

In [2]:
set_config(display='diagram')

In [3]:
def get_filtered_tweets(query_limit = 100):
    raw_tweets = get_raw_tweets(query_limit)
    data = {'id' : [tweet.id for tweet in raw_tweets],
        'text' : [remove_emojis(tweet.text) for tweet in raw_tweets],
        'company_id' : [tweet.company_id for tweet in raw_tweets],
        'date' : [tweet.date for tweet in raw_tweets]
        }
    filtered_df = pd.DataFrame(data)
    return filtered_df

In [4]:
filtered_df = get_filtered_tweets()

In [5]:
filtered_df.head()

Unnamed: 0,id,text,company_id,date
0,e2025ffbfaab1fb2cb0d2b989b0da5b80b896c1c400f21...,@gtbank @gtbank_help Return my money Abeg!!! 5...,1,2023-01-11 08:23:00-05:00
1,5f93f87f8dc1a512bde089428190d7b0829052ffe868a3...,@gtbank_help This your customer care line be p...,1,2023-01-11 08:20:00-05:00
2,165dd482dd792047ee16fef14c278306aee4f903c354c0...,@gtbank_help I need help with the authorizatio...,1,2023-01-11 08:18:00-05:00
3,edcf21a701b43a125811690025d44be54b36c39b6bd371...,"So,u no see @ZenithBank ??",1,2023-01-11 08:17:00-05:00
4,d918053a7429c998f823dead2d8632a8ac40169d30bee1...,"Recently, some banks in Nigeria suspended inte...",1,2023-01-11 08:16:00-05:00


In [6]:
sk_lda_pipeline = joblib.load('sklearn_LDA_pipeline.joblib')

In [7]:
print(sk_lda_pipeline)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('normalizer',
                                  FunctionTransformer(func=<function normalize_text at 0x00000124B7D14AF0>)),
                                 ('vectorizer',
                                  CountVectorizer(max_df=0.95,
                                                  max_features=5000, min_df=50,
                                                  ngram_range=(1, 3),
                                                  stop_words=['nothing',
                                                              'forty', 'beyond',
                                                              'yours',
                                                              'however',
                                                              'fifty', 'seems',
                                                              'meanwhile', 'be',
                                                              'well', 'put',
            

In [8]:
gensim_lda_pipeline = joblib.load('gensim_LDA_pipeline.joblib')

In [9]:
print(gensim_lda_pipeline)

Pipeline(steps=[('normalizer',
                 FunctionTransformer(func=<function normalize_text at 0x00000124B7D14AF0>)),
                ('tokenizer',
                 FunctionTransformer(func=<function tokenizer_func at 0x00000124C2E44940>)),
                ('model',
                 GensimLdaTransformer(gensim_dictionary=<gensim.corpora.dictionary.Dictionary object at 0x00000124C7CA5D00>,
                                      gensim_model=<gensim.models.ldamodel.LdaModel object at 0x00000124C68B07C0>))])


In [10]:
sk_full_lda_pipeline = joblib.load('full_lda_pipeline.joblib')

In [11]:
print(sk_full_lda_pipeline)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('normalizer',
                                  FunctionTransformer(func=<function normalize_text at 0x00000124B7D14AF0>)),
                                 ('vectorizer',
                                  CountVectorizer(max_df=0.95,
                                                  max_features=10000,
                                                  min_df=100,
                                                  ngram_range=(1, 3),
                                                  stop_words=['fill', 'many',
                                                              'their', 'which',
                                                              'may', 'becomes',
                                                              'somehow', 'with',
                                                              'yet', 'each',
                                                              'or', 'several',
                     

In [12]:
sentiment_analysis_pipeline = joblib.load('sentiment_analysis_pipeline.joblib')

In [13]:
print(sentiment_analysis_pipeline)

<transformers.pipelines.text_classification.TextClassificationPipeline object at 0x00000124C8200AC0>


In [14]:
#filtered_df = pd.read_csv('gensim_topic.csv',usecols=['index','text','date','Sentiment'])

In [15]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype                    
---  ------      --------------  -----                    
 0   id          100 non-null    object                   
 1   text        100 non-null    object                   
 2   company_id  100 non-null    int64                    
 3   date        100 non-null    datetime64[ns, UTC-05:00]
dtypes: datetime64[ns, UTC-05:00](1), int64(1), object(2)
memory usage: 3.2+ KB


In [16]:
check_text = filtered_df['text'].iloc[0:10].to_list()

In [17]:
print(sk_full_lda_pipeline.transform(check_text).round(3))

[[0.001 0.001 0.001 0.505 0.001 0.001 0.001 0.484 0.001 0.001]
 [0.002 0.985 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002]
 [0.002 0.002 0.74  0.002 0.245 0.002 0.002 0.002 0.002 0.002]
 [0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1  ]
 [0.001 0.001 0.001 0.001 0.287 0.192 0.001 0.001 0.512 0.001]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.987 0.001 0.001]
 [0.009 0.009 0.009 0.009 0.009 0.009 0.009 0.918 0.009 0.009]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.989 0.001 0.001]
 [0.005 0.005 0.005 0.432 0.005 0.005 0.005 0.53  0.005 0.005]
 [0.001 0.001 0.001 0.001 0.001 0.001 0.989 0.001 0.001 0.001]]


In [18]:
print(sk_lda_pipeline.transform(check_text).round(3))

[[0.001 0.001 0.001 0.001 0.001 0.001 0.34  0.001 0.001 0.652]
 [0.001 0.001 0.256 0.001 0.001 0.001 0.001 0.001 0.733 0.001]
 [0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.982 0.002 0.002]
 [0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1   0.1  ]
 [0.002 0.002 0.985 0.002 0.002 0.002 0.002 0.002 0.002 0.002]
 [0.001 0.001 0.466 0.001 0.001 0.001 0.001 0.523 0.001 0.001]
 [0.002 0.002 0.978 0.002 0.002 0.002 0.002 0.002 0.002 0.002]
 [0.002 0.643 0.341 0.002 0.002 0.002 0.002 0.002 0.002 0.002]
 [0.005 0.005 0.957 0.005 0.005 0.005 0.005 0.005 0.005 0.005]
 [0.001 0.398 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.592]]


In [19]:
print(gensim_lda_pipeline.transform(check_text))

[[0.25707287, 0.008635622, 0.0069318595, 0.6099758, 0.00499411, 0.004382017, 0.0039037492, 0.003519588, 0.09764353, 0.0029408142], [0.31591144, 0.014110094, 0.011342628, 0.00950143, 0.61975914, 0.0071710274, 0.0063883103, 0.0057596774, 0.005243681, 0.0048125377], [0.5522312, 0.36745444, 0.015613755, 0.013066233, 0.011239155, 0.009861208, 0.008784889, 0.007920405, 0.007210829, 0.0066179433], [0.20349775, 0.1546068, 0.12465746, 0.10442833, 0.08984802, 0.0788403, 0.07023542, 0.063324034, 0.057650995, 0.052910846], [0.68201125, 0.011113677, 0.008913455, 0.0074663796, 0.0064212866, 0.0056344178, 0.0050194124, 0.26551872, 0.0041200514, 0.0037812942], [0.6264858, 0.012923171, 0.010392082, 0.008708214, 0.007488664, 0.006570523, 0.005853423, 0.0052773957, 0.0048046047, 0.31149614], [0.051288467, 0.0387197, 0.031166764, 0.7756181, 0.022464566, 0.01971058, 0.017559294, 0.015831403, 0.014413106, 0.013228039], [0.70442873, 0.012979389, 0.01040398, 0.008714718, 0.007493435, 0.0065746345, 0.005857018

In [20]:
sk_result = sk_lda_pipeline.transform(check_text).round(3)
sk_full_result = sk_full_lda_pipeline.transform(check_text).round(3)
gensim_result = gensim_lda_pipeline.transform(check_text)
sentiment_result = sentiment_analysis_pipeline(check_text)

In [21]:
for res in sentiment_result:
    print(get_pos_sentiment_proba(res))

0.040087420493364334
0.027732184855267406
0.43706795386970043
0.4269000133499503
0.33365124091506004
0.0405961733777076
0.05149565334431827
0.020634160144254565
0.15348927164450288
0.03800583933480084


In [22]:
for index,text in enumerate(check_text):
    print(index + 1)
    print(text)

1
@gtbank @gtbank_help Return my money Abeg!!! 5 working days don pass o and no refund and to cap it off your rubbish WhatsApp portal Is non responsive  I’ve never seen a more unreachable bank,it’s shameless
2
@gtbank_help This your customer care line be playing music like we called to attend a concert. Very disappointed...
3
@gtbank_help I need help with the authorization code to set up my soft token.
4
So,u no see @ZenithBank ??
5
Recently, some banks in Nigeria suspended international transactions with naira card. In this Explainer, I spoke with experts on the implications of the decision on businesses and the Nigerian people.  https://dailytrust.com/explainer-how-suspension-of-intl-transactions-on-naira-cards-will-affect-nigerians/ @gtbank @FirstBankngr @cenbank @NGRPresident
6
@gtbank_help you people are scam , just got to one of your branch and they told me that this is not your page and numbers you’ve been telling to chat up are not in anyway your agents , @gtbank  is a fraud
7


In [23]:
for index,res in enumerate(sk_full_result):
    print(index + 1)
    print(get_topic_assignment(res, topic_mapping_sk_full_lda))

1
Electronic Banking - Transaction Errors
2
General Complaints
3
Transactions - Value Added Services
4
Unclassified
5
Transaction Issues - General
6
Electronic Banking - Complaints & Fraud Reports
7
Electronic Banking - Complaints & Fraud Reports
8
Electronic Banking - Complaints & Fraud Reports
9
Electronic Banking - Complaints & Fraud Reports
10
Account Related Issues


In [24]:
for index, res in enumerate(sk_result):
    print(index + 1)
    print(get_topic_assignment(res, topic_mapping_sk_lda))

1
Fraud
2
Transaction Related Issue
3
Mobile Banking
4
Unclassified
5
General Complaints
6
Mobile Banking
7
General Complaints
8
Accounted Related Queries
9
General Complaints
10
Fraud


In [25]:
for index, res in enumerate(gensim_result):
    print(index + 1)
    print(get_topic_assignment(res, topic_mapping_gensim_lda))

1
Customer Service Feedback
2
Physical Branch
3
Fraud
4
Unclassified
5
Fraud
6
Fraud
7
Customer Service Feedback
8
Fraud
9
Physical Branch
10
Fraud


In [26]:
def apply_topic_to_df(filtered_df):
    sentiment_result = sentiment_analysis_pipeline(filtered_df['text'].to_list())
    sentiment_result = [get_pos_sentiment_proba(array) for array in sentiment_result]
    filtered_df['sentiment'] = sentiment_result
    sk_full_result = sk_full_lda_pipeline.transform(filtered_df['text'])
    sk_full_result = [get_topic_assignment(array,topic_mapping_sk_full_lda) for array in sk_full_result]
    filtered_df['sk_full_topic'] = sk_full_result
    sk_result = sk_lda_pipeline.transform(filtered_df['text'])
    sk_result = [get_topic_assignment(array,topic_mapping_sk_lda) for array in sk_result]
    filtered_df['sk_topic'] = sk_result
    gensim_result = gensim_lda_pipeline.transform(filtered_df['text'])
    gensim_result = [get_topic_assignment(array,topic_mapping_gensim_lda) for array in gensim_result]
    filtered_df['gensim_topic'] = gensim_result
    return filtered_df

In [27]:
filtered_df = apply_topic_to_df(filtered_df)

In [28]:
filtered_df.head()

Unnamed: 0,id,text,company_id,date,sentiment,sk_full_topic,sk_topic,gensim_topic
0,e2025ffbfaab1fb2cb0d2b989b0da5b80b896c1c400f21...,@gtbank @gtbank_help Return my money Abeg!!! 5...,1,2023-01-11 08:23:00-05:00,0.040087,Electronic Banking - Transaction Errors,Fraud,Customer Service Feedback
1,5f93f87f8dc1a512bde089428190d7b0829052ffe868a3...,@gtbank_help This your customer care line be p...,1,2023-01-11 08:20:00-05:00,0.027732,General Complaints,Transaction Related Issue,Physical Branch
2,165dd482dd792047ee16fef14c278306aee4f903c354c0...,@gtbank_help I need help with the authorizatio...,1,2023-01-11 08:18:00-05:00,0.437068,Transactions - Value Added Services,Mobile Banking,Fraud
3,edcf21a701b43a125811690025d44be54b36c39b6bd371...,"So,u no see @ZenithBank ??",1,2023-01-11 08:17:00-05:00,0.4269,Unclassified,Unclassified,Unclassified
4,d918053a7429c998f823dead2d8632a8ac40169d30bee1...,"Recently, some banks in Nigeria suspended inte...",1,2023-01-11 08:16:00-05:00,0.333651,Transaction Issues - General,General Complaints,Fraud


In [21]:
# def store_processed_tweets(filtered_df):
#     data = filtered_df.to_dict(orient='records')
#     with Session() as session:
#     for item in data:
#         processed_tweet = ProcessedTweet(**item)
#         try:
#             session.add(processed_tweet)
#             session.commit()
#         except Exception as e:
#             session.rollback()
#             print(e)

In [29]:
store_processed_tweets(filtered_df)

In [30]:
store_processed_tweets(apply_topic_to_df(get_filtered_tweets()))

In [31]:
for i in range(round(185)):
    try:
        store_processed_tweets(apply_topic_to_df(get_filtered_tweets(1000)))
    except Exception as e:
        print(e)

The expanded size of the tensor (809) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 809].  Tensor sizes: [1, 514]
The expanded size of the tensor (544) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 544].  Tensor sizes: [1, 514]
The expanded size of the tensor (596) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 596].  Tensor sizes: [1, 514]
The expanded size of the tensor (573) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 573].  Tensor sizes: [1, 514]
The expanded size of the tensor (809) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 809].  Tensor sizes: [1, 514]
The expanded size of the tensor (544) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 544].  Tensor sizes: [1, 514]
The expanded size of the tensor (596) must match the existing size (514) at non-singleto

In [23]:
# with Session() as session:
#     for item in data:
#         processed_tweet = ProcessedTweet(**item)
#         try:
#             session.add(processed_tweet)
#             session.commit()
#         except Exception as e:
#             session.rollback()
#             print(e)

In [24]:
sk_result = sk_lda_pipeline.transform(filtered_df['text'])

In [25]:
sk_result = list(sk_result)

In [38]:
sk_result[40-1]

array([0.00243902, 0.25670971, 0.7237781 , 0.00243902, 0.00243902,
       0.00243902, 0.00243902, 0.00243902, 0.00243902, 0.00243902])

In [41]:
filtered_df['text'].iloc[38:40]

38    @gtbank_help how can I put my email to my acco...
39    @gtbank_help please how I put my email to my a...
Name: text, dtype: object

In [40]:
normalize_text(filtered_df['text'].iloc[38:40])

array(['email account whitout bank', 'email account Whitout enter bank'],
      dtype='<U32')