In [1]:
from os import getenv
from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from pathlib import Path
from urllib.parse import quote 

def get_dot_env_file(ENV):
    """
    Resolve the dotenv file to load.

    Args:
        ENV (str): name of the dotenv file, relative to the current working
            directory. A falsy value triggers the fallback lookup.

    Returns:
        The path ``<cwd>/<ENV>`` when ``ENV`` is truthy, otherwise whatever
        ``find_dotenv`` returns for ``<parent of cwd>/.env.dev``.
    """
    working_dir = Path.cwd()
    if not ENV:
        # No explicit file: look for .env.dev in the parent of the working directory.
        fallback = working_dir.parent.joinpath(".env.dev")
        return find_dotenv(fallback)
    return working_dir / ENV

def read_credentails(file='.env'):
    """
    Load the PostgreSQL connection settings from a dotenv file.

    The file is resolved with ``get_dot_env_file`` and loaded with
    ``override=True``, so values from the file replace variables that are
    already set in the process environment.

    Args:
        file (str): dotenv file name passed to ``get_dot_env_file``.
            Defaults to '.env'.

    Returns:
        dict: the keys 'database_host', 'database_user', 'database_name',
            'database_password' and 'database_port', holding the values of
            POSTGRES_HOST, POSTGRES_USER, POSTGRES_DB, POSTGRES_PASSWORD and
            POSTGRES_PORT respectively.

    Raises:
        ValueError: if any of those variables is unset or empty; the message
            lists the offending variable names.
    """
    dot_env_file = get_dot_env_file(file)
    load_dotenv(dotenv_path=dot_env_file, override=True)
    # Result key -> environment variable. The order here is the order of the returned dict.
    env_names = {
        'database_host': 'POSTGRES_HOST',
        'database_user': 'POSTGRES_USER',
        'database_name': 'POSTGRES_DB',
        'database_password': 'POSTGRES_PASSWORD',
        'database_port': 'POSTGRES_PORT',
    }
    credentials = {key: getenv(name) for key, name in env_names.items()}
    print(f"the database host is {credentials['database_host']}", 10 * "*==-")
    missing = [env_names[key] for key, value in credentials.items() if not value]
    if missing:
        # Adjacent literals instead of a backslash continuation, which used to
        # embed the source indentation in the message.
        raise ValueError(
            'Please add a .env file and put the credentials on it, '
            'refer to the sample (missing or empty: {})'.format(', '.join(missing)))
    return credentials

In [2]:
def get_database_session(credentials):
    """
    Create a SQLAlchemy engine and an open session for the PostgreSQL database.

    Args:
        credentials (dict): connection settings with the keys 'database_user',
            'database_password', 'database_host', 'database_port' and
            'database_name'.

    Returns:
        tuple: ``(session, engine)``.
    """
    url_template = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'
    url_parts = dict(user=credentials.get('database_user'),
                     host=credentials.get('database_host'),
                     database=credentials.get('database_name'),
                     port=credentials.get('database_port'))
    # safe='' makes quote() escape '/' as well (it is left as-is by default),
    # so no reserved URL character from the password reaches the URL unescaped.
    escaped_password = quote(credentials.get('database_password'), safe='')
    database_url = url_template.format(password=escaped_password, **url_parts)
    # Show where we connect without echoing the password into the notebook output.
    print(url_template.format(password='***', **url_parts), "==" * 5)
    engine = create_engine(database_url)
    Session = sessionmaker(bind=engine)
    session = Session()
    return session, engine

In [3]:
from pathlib import Path
current_dir = Path().cwd()
credentials = read_credentails('.env')
# Show the loaded settings with the password masked, so the secret is not
# stored in the notebook's saved output.
print({key: ('***' if key == 'database_password' else value)
       for key, value in credentials.items()})
session, engine = get_database_session(credentials)

the database host is 89.40.12.167 *==-*==-*==-*==-*==-*==-*==-*==-*==-*==-
{'database_host': '89.40.12.167', 'database_user': 'es_py', 'database_name': 'tweets_analysis', 'database_password': '********', 'database_port': '5432'}


This part of the code is duplicated in the main file; I need to create a module for it.

In [4]:
from datetime import datetime
# ISO calendar date (YYYY-MM-DD) used to keep only the tweets created today.
today = datetime.today().date().isoformat()
sql_query = (
    "select  raw_json->>'text' as tweet_text from tweet "
    f"where date(created_at) = '{today}'"
)

In [55]:
# Display the generated query.
# NOTE(review): the saved output below lacks the `as tweet_text` alias and this
# cell's execution count is 55, so the output looks stale; re-run to refresh it.
sql_query

"select  raw_json->>'text' from tweet where date(created_at) = '2021-11-25'"

In [5]:
import pandas as pd
# The query selects a single column (tweet_text), so there is no date column
# to parse; the previous parse_dates=['created_at'] named a column that is
# not in the result.
with engine.connect() as connection:
    data = pd.read_sql_query(sql=sql_query, con=connection)

In [6]:
# Preview the first rows of the query result.
data.head()

Unnamed: 0,tweet_text
0,Coordinateur·trice Santé - RDC - Kinshasa\nhtt...
1,@BleacherReport @KingJames Still the best on t...
2,@BenitaNtumba @fatshi13 @JacquesKyabula @kabun...
3,@NALAmoney @Benji_Fernandes Nala to DRC before...
4,@InterCronicas coragem kkkkk


#### Cleaning the text

In this part we will clean the text using our default text cleaner and a new tweet tokenizer from Hugging Face.

tweet_text    @BenitaNtumba @fatshi13 @JacquesKyabula @kabun...
Name: 4, dtype: object

array(['@Bleacherreport @Kingjames Still The Best On The Planet',
       '@Intercronicas Coragem Kkkkk',
       '@Kelly_Rdc Https://T.Co/N7Gbedqa7P', ...,
       '@Fernandoaquezad @Kelly_Rdc Ahhh... I Was Gonna Say. I Know Times Have Changed, But I Still Dig The Chicks, Bro.\n\nI… Https://T.Co/T5Mkmpvrm6',
       '@Innocentmirimo3 @Fatshi13 Kagame Se Sent Vraiment Chez Lui Après Le Départ De Jkk. Quelle Image Fatshi Va Laisser… Https://T.Co/Qms0Lkvh69',
       'My Father Is A Retired Soldier, His Last Mission Was In Drc, He Fought In The Liberation War, So Many Stories About… Https://T.Co/Dbtdjs1Tde'],
      dtype=object)

In [20]:
from tweets_cleaner.TweetsCleaner import TweetsCleaner
# NOTE(review): '.' is presumably the base directory for the cleaner's
# resources; confirm against TweetsCleaner.
cleaner = TweetsCleaner('.')

In [21]:
# Run the project tweet cleaner on every tweet, passing True for both of its extra positional flags.
data['cleaned_tweet'] = data['tweet_text'].apply(lambda tweet: cleaner.prepocess_tweet(tweet, True, True))

To explore for lemmatization:

https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer

In [18]:
import unicodedata

# Latin letters with no Unicode decomposition: NFKD leaves them intact, so the
# ASCII encoding step would drop them entirely ("cœur" -> "cur").
_ASCII_FALLBACKS = str.maketrans({
    'œ': 'oe', 'Œ': 'OE',
    'æ': 'ae', 'Æ': 'AE',
    'ß': 'ss',
    'ø': 'o', 'Ø': 'O',
})


def remove_accents(input_str):
    """
    Return an ASCII-only version of ``input_str`` with accents stripped.

    Ligatures and letters that have no decomposed form (œ, æ, ß, ø) are
    transliterated first. The text is then NFKD-normalized so accented letters
    split into a base letter plus combining marks, and every remaining
    non-ASCII character is dropped.

    Args:
        input_str (str): text to convert.

    Returns:
        str: the ASCII-only text.
    """
    expanded = input_str.translate(_ASCII_FALLBACKS)
    nfkd_form = unicodedata.normalize('NFKD', expanded)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode('ascii')

In [19]:
# Quick check on a French phrase containing an accented letter.
remove_accents("président felix antoine")

'president felix antoine'

In [23]:
# Spot-check ten cleaned tweets. The sample is unseeded, so the rows shown
# change on every run.
data['cleaned_tweet'].sample(10)

13083    [felix, tshisekedi, accueille, chef, detat, je...
1678               [coffee, family, seeing, god, one, day]
8457     [remember, rumors, kagame, death, whole, popul...
3709     [nadie, dice, las, patovacacione, desacostumbr...
15158    [acho, minimo, poderia, fazer, depois, desser,...
2039     [shared, people, mak, initial, enquiry, text, ...
1099     [congratulation, raymond, who, key, turkey, bo...
3277                                                    []
3467     [echange, depolitisation, ndoto, baba, solutio...
13135              [cher, compatriote, president, mausole]
Name: cleaned_tweet, dtype: object

We can say that part of the text cleaning is done, but we still need to improve the lemmatization, especially for nouns: for example, Kinshasa is lemmatized as "Kinshaser" instead of being kept as Kinshasa (the city). Check out this [link](https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer) to see how the lemmatization is done.

Let's now move to the sentiment analysis part of the project.

### Sentiment analysis


This is a case for sentiment analysis, but we will not do it in a supervised way; instead we will do it in an unsupervised way, using the [Vader library](https://github.com/cjhutto/vaderSentiment) for French sentiment analysis.

For supervised sentiment analysis, it is worth exploring the BERT tool [here](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert).

In [29]:
from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer



In [25]:
# Instantiate the analyzer with no arguments.
sentiment_analyzer = SentimentIntensityAnalyzer()

In [28]:
# Score one cleaned tweet (row label 3467, one of the rows in the sample shown
# earlier); its token list is joined back into a plain string before scoring.
sentiment_analyzer.polarity_scores(' '.join(data.loc[3467, 'cleaned_tweet']))

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}