In [1]:
import pandas as pd
# import dask.dataframe as dd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /home/benjamin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Column labels are passed through `names`, so pandas treats the first CSV row as data.
sentiment140_columns = ["target", "id", "date", "flag", "user", "text"]

# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
data = pd.read_csv(
    '/home/benjamin/Documents/OpenClassroomsDatasets/sentiment/sentiment140/training.1600000.processed.noemoticon.csv',
    names=sentiment140_columns,
    encoding="ISO-8859-1",
)

In [None]:
data.info()

In [3]:
# Keep only the label and the tweet text. errors="ignore" makes the cell safe to
# re-run: once the metadata columns are gone, a second run is a no-op instead of
# raising KeyError.
data = data.drop(columns=["id", "date", "flag", "user"], errors="ignore")

In [4]:
# Translate the numeric target codes into readable class names.
# The dtype guard makes the cell idempotent: mapping already-translated string
# labels through the same int-keyed dict would silently turn every value into NaN.
if pd.api.types.is_numeric_dtype(data["target"]):
    data["target"] = data["target"].map(
        {
            0: "NEGATIVE",
            2: "NEUTRAL",
            4: "POSITIVE",
        }
    )

In [5]:
def sample_equal_classes(df, n_pos=100000, n_neg=100000, random_state=42):
    """
    Draw a fixed number of POSITIVE and NEGATIVE rows from a labelled dataframe.

    Args:
        df (pd.DataFrame): dataframe with a "target" column holding the
            "POSITIVE" / "NEGATIVE" labels.
        n_pos (int): number of "POSITIVE" rows to draw.
        n_neg (int): number of "NEGATIVE" rows to draw.
        random_state: seed forwarded to ``DataFrame.sample`` for both draws.
            The fixed default makes the sample reproducible across kernel
            restarts; pass ``None`` for a different sample on every call.

    Returns:
        pd.DataFrame: the positive rows followed by the negative rows, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: raised by pandas when a class holds fewer rows than
            requested (sampling is done without replacement).
    """
    df_pos = df[df["target"] == "POSITIVE"].sample(n=n_pos, random_state=random_state)
    df_neg = df[df["target"] == "NEGATIVE"].sample(n=n_neg, random_state=random_state)

    return pd.concat([df_pos, df_neg]).reset_index(drop=True)
sampled_df = sample_equal_classes(data)

In [6]:
def reduce_dataframe_memory_usage(
    df: pd.DataFrame,
    high_precision: bool = False,
) -> pd.DataFrame:
    """
    Iterate through all the columns of a dataframe and modify the data type to
    reduce memory usage.

    The dataframe is modified in place (columns are reassigned on ``df``) and
    the same object is returned for convenience.

    Conversion rules:
        * "object" columns become "category" when they hold fewer than
          max(100, 1% of the rows) distinct values, otherwise "string".
        * columns whose dtype name starts with "int" become the first pandas
          nullable integer dtype, in the order UInt8, Int8, UInt16, Int16,
          UInt32, Int32, UInt64, whose range contains the column's min and max
          (bounds inclusive); "Int64" is the fallback.
        * columns whose dtype name starts with "float" become "float32" unless
          ``high_precision`` is set or the values do not fit, in which case
          they become "float64".
        * every other dtype is left untouched.

    Args:
        df (pd.DataFrame): dataframe to reduce memory usage.
        high_precision (bool): If True, use 64-bit floats instead of 32-bit

    Returns:
        pd.DataFrame: dataframe with reduced memory usage.
    """
    start_mem = round(df.memory_usage().sum() / 1024**2, 2)
    # %.2f rather than %d: the value is rounded to two decimals and an integer
    # format would truncate it (anything below 1 MB would be logged as 0).
    logging.info("Memory usage of dataframe is %.2f MB", start_mem)

    # Target integer dtypes, tried from first to last.
    int_candidates = (
        ("UInt8", np.uint8),
        ("Int8", np.int8),
        ("UInt16", np.uint16),
        ("Int16", np.int16),
        ("UInt32", np.uint32),
        ("Int32", np.int32),
        ("UInt64", np.uint64),
    )

    # Iterate through columns
    for col in df.columns:
        dtype_name = str(df[col].dtype)

        if df[col].dtype == "object":
            # "object" dtype
            if df[col].nunique() < max(100, df.shape[0] / 100):
                # If number of unique values is less than max(100, 1%)
                df[col] = df[col].astype("category")
            else:
                # If number of unique values is greater than max(100, 1%)
                df[col] = df[col].astype("string")

        elif dtype_name[:3] == "int":
            # "int" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            for nullable_name, numpy_type in int_candidates:
                bounds = np.iinfo(numpy_type)
                # Inclusive comparison: a value sitting exactly on a bound
                # (e.g. 0 or 255 for UInt8) is representable in that dtype.
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(nullable_name)
                    break
            else:
                df[col] = df[col].astype("Int64")

        elif dtype_name[:5] == "float":
            # "float" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            # Strict comparison kept on purpose: NaN or infinite extrema fail
            # the test and the column stays in float64.
            if (
                not high_precision
                and c_min > np.finfo(np.float32).min
                and c_max < np.finfo(np.float32).max
            ):
                df[col] = df[col].astype("float32")
            else:
                df[col] = df[col].astype("float64")

    end_mem = round(df.memory_usage().sum() / 1024**2, 2)
    logging.info("Memory usage after optimization is %.2f MB", end_mem)
    if start_mem > 0:
        logging.info(
            "Decreased by %d %%", round(100 * (start_mem - end_mem) / start_mem)
        )

    return df

sampled_df = reduce_dataframe_memory_usage(sampled_df)

In [7]:
# NLTK helpers shared by every tweeter() call; filled on first use.
_TWEETER_RESOURCES = {}


def _tweeter_resources():
    """Create the stemmer, tokenizer and stop-word set once and reuse them afterwards."""
    if not _TWEETER_RESOURCES:
        # update() evaluates all three values before storing any of them, so a
        # failure (e.g. missing NLTK corpus) never leaves a half-filled cache.
        _TWEETER_RESOURCES.update(
            stemmer=PorterStemmer(),
            tokenizer=TweetTokenizer(preserve_case=False, reduce_len=True),
            stop_words=frozenset(stopwords.words('english')),
        )
    return _TWEETER_RESOURCES


def tweeter(sentence):
    """
    Normalise one tweet into a space-separated string of stemmed tokens.

    The text is tokenized with ``TweetTokenizer(preserve_case=False,
    reduce_len=True)``; English stop words and tokens that are not purely
    alphabetic are dropped; the remaining tokens are stemmed with the Porter
    stemmer.

    The NLTK objects are built on the first call and cached: the function is
    applied row by row to the whole dataframe, and rebuilding them (and
    re-reading the stop-word list) on every call is wasted work.

    Args:
        sentence (str): raw tweet text.

    Returns:
        str: stems joined by single spaces; empty string when no token survives.
    """
    resources = _tweeter_resources()
    stemmer = resources["stemmer"]
    stop_words = resources["stop_words"]

    tokens = resources["tokenizer"].tokenize(sentence)
    stems = [
        stemmer.stem(word.lower())
        for word in tokens
        if word not in stop_words and word.isalpha()
    ]
    return " ".join(stems)

In [8]:
# Normalise every tweet; the function is passed directly instead of through a lambda.
sampled_df["text"] = sampled_df["text"].apply(tweeter)
sampled_df.head()

Unnamed: 0,target,text
0,POSITIVE,go shop
1,POSITIVE,oh movi sooo much wonder although need grovel end
2,POSITIVE,use tryin get shape
3,POSITIVE,thank respons ye inde figur respons trial erro...
4,POSITIVE,yep enjoy movi


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# min_df=0.001: ignore terms that appear in fewer than 0.1% of the documents.
tf = TfidfVectorizer(min_df=0.001)

# Keep the TF-IDF matrix sparse. Calling .toarray() here materialises a dense
# float64 array of n_documents x n_terms (about 1.5 GB for 200000 x 959) while
# train_test_split and LogisticRegression both accept the sparse matrix as is.
X = tf.fit_transform(sampled_df['text'])
y = sampled_df["target"]

# 80/20 split with a fixed seed so the evaluation set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape,X_test.shape

((160000, 959), (40000, 959))

In [10]:
# Initialize Logistic Regression model
logistic_model = LogisticRegression()

# Train the model
logistic_model.fit(X_train, y_train)

In [11]:
# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.745075
Confusion Matrix:
 [[14273  5734]
 [ 4463 15530]]
Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.76      0.71      0.74     20007
    POSITIVE       0.73      0.78      0.75     19993

    accuracy                           0.75     40000
   macro avg       0.75      0.75      0.74     40000
weighted avg       0.75      0.75      0.74     40000



In [17]:
# Example new text data
new_text = ["This is an example sentence.",
            "What a stupid idea !",
            "Truly incredible comeback !"]

# Preprocess and transform the new text data to TF-IDF features
new_text_processed = [tweeter(text) for text in new_text]
new_text_tfidf = tf.transform(new_text_processed).toarray()

# Make predictions
predictions = logistic_model.predict(new_text_tfidf)
print("Predictions:", predictions)

Predictions: ['POSITIVE' 'NEGATIVE' 'POSITIVE']


In [14]:
import pickle

# Persist the trained classifier. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("log_reg.pkl", "wb") as pickle_out:
    pickle.dump(logistic_model, pickle_out)

In [33]:
# Persist the fitted TF-IDF vectorizer. The context manager closes the file
# even if pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("tfidf.pkl", "wb") as pickle_out:
    pickle.dump(tf, pickle_out)

In [40]:
#test API
import requests

test = "Incredibly done !"
url = "http://127.0.0.1:8000/predict"
headers = {"Content-Type": "application/json"}
body = {
    "text" : test
}

response = requests.post(url, json=body, headers=headers)


if response.status_code == 200:
    print(response.text)
else:
    print(f"Erreur : {response.status_code} : {response.reason}")
    

{"Prediction":"POSITIVE"}
