# TF-IDF + SVD + Random Forest + Gradient Boosting + Support Vector Classifier

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def delete_hashs(df: pd.DataFrame) -> pd.DataFrame:
    df = df[~df['text'].astype(str).str.startswith('#')]
    df = df.reset_index(drop=True)
    return df

def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    for col in ['Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear','Sadness', 'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral']:
        df[col] = df[col].apply(lambda x: 1 if x else 0)
    return df

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = df.columns.str.lower()
    return df

def delete_empty(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df['text'].astype(str).str.len() > 2]
    df = df.reset_index(drop=True)
    return df

def embed_text(df: pd.DataFrame) -> pd.DataFrame:
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(df['text'].astype(str))
    
    # Redukcja wymiarowości do 100 (możesz zmienić n_components)
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_svd = svd.fit_transform(X_tfidf)
    
    svd_df = pd.DataFrame(X_svd, columns=[f'svd_{i}' for i in range(X_svd.shape[1])])
    df = pd.concat([svd_df, df.reset_index(drop=True)], axis=1)
    return df

def transform_texts(df: pd.DataFrame) -> pd.DataFrame:
    result_df = delete_hashs(df=df)
    result_df = encode_labels(df=result_df)
    result_df = embed_text(df=result_df)
    result_df = rename_columns(df=result_df)
    return result_df

def transform_sentences(df: pd.DataFrame) -> pd.DataFrame:
    result_df = pd.DataFrame(data={
        'text': [],
        'Joy': [], 'Trust': [], 'Anticipation': [], 'Surprise': [], 'Fear': [], 'Sadness': [],
           'Disgust': [], 'Anger': [], 'Positive': [], 'Negative': [], 'Neutral': []
        }
    )
    
    sentences = []
    
    for index in df.index.tolist():
        if (str)(df.loc[index, 'text']).startswith('#'):
            sentence = " ".join(sentences)
            df.loc[index, 'text'] = sentence
            result_df = pd.concat([result_df, df.loc[[index]]])
            sentences = []
        else:
            sentences.append((str)(df.loc[index, 'text']))
            
    result_df = delete_hashs(df=result_df)
    result_df = delete_empty(df=result_df)
    result_df = encode_labels(df=result_df)
    result_df = embed_text(df=result_df)
    result_df = rename_columns(df=result_df)
    return result_df

In [2]:
import os
from typing import List

def load_data() -> List:
    data = []
    
    for type in ['train', 'val', 'test']:
        for category in ['texts', 'sentences']:
            if os.path.exists(f'../data/clean/ml_{type}_{category}.csv'):
                df = pd.read_csv(f'../data/clean/ml_{type}_{category}.csv', index_col=0)
            else:
                df = pd.read_csv(f'../data/raw/{type}.csv')
                if category == 'texts':
                    df = transform_texts(df=df)
                elif category == 'sentences':
                    df = transform_sentences(df=df)
                df.to_csv(f'../data/clean/ml_{type}_{category}.csv')

            data.append(df)        
    return data

In [3]:
data = load_data()

In [4]:
train_texts = data[0]
train_sentences = data[1]
val_texts = data[2]
val_sentences = data[3]
test_texts = data[4]
test_sentences = data[5]

In [5]:
train_texts = pd.concat([train_texts, val_texts], axis=0)
train_sentences = pd.concat([train_sentences, val_sentences], axis=0)

In [6]:
train_texts.shape

(7627, 112)

In [7]:
X_train_texts = train_texts.iloc[:, :100]

In [8]:
emotions = ['joy', 'trust', 'anticipation', 'surprise', 'fear','sadness', 'disgust', 'anger', 'positive', 'negative', 'neutral']
y_train = {}

for emotion in emotions:
    y_train[f'train_{emotion}'] = train_texts.loc[:, emotion]

In [9]:
y_train.keys()

dict_keys(['train_joy', 'train_trust', 'train_anticipation', 'train_surprise', 'train_fear', 'train_sadness', 'train_disgust', 'train_anger', 'train_positive', 'train_negative', 'train_neutral'])

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

models = {
    'gradient_boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5],
            'min_samples_split': [2, 5],
            'subsample': [0.8, 1.0]
        }
    },
    'svm': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'class_weight': ['balanced']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'class_weight': ['balanced']
        }
    }
}

In [11]:
from sklearn.metrics import make_scorer, precision_score, recall_score
import pickle

scoring = {
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted')
}

best_models = {}

for emotion in y_train.keys():
    print(f"\nProcessing emotion: {emotion}")
    best_score = 0
    best_model_name = None
    best_params = None
    
    for model_name, model_info in models.items():
        print(f"\nTrying {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=model_info['model'],
            param_grid=model_info['params'],
            scoring=scoring,
            refit='recall',
            cv=5,
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train_texts, y_train[emotion])
        
        current_score = grid_search.best_score_
        
        if current_score > best_score:
            best_score = current_score
            best_model_name = model_name
            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_
        
        print(f"Best {model_name} score: {current_score:.4f}")
        print(f"Best parameters: {grid_search.best_params_}")
    
    best_models[emotion] = {
        'model_type': best_model_name,
        'parameters': best_params,
        'model': best_model,
        'best_score': best_score
    }



Processing emotion: train_joy

Trying gradient_boosting...
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best gradient_boosting score: 0.6609
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 0.8}

Trying svm...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best svm score: 0.6663
Best parameters: {'C': 1, 'class_weight': 'balanced', 'kernel': 'rbf'}

Trying random_forest...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best random_forest score: 0.6637
Best parameters: {'class_weight': 'balanced', 'max_depth': 20, 'n_estimators': 200}

Processing emotion: train_trust

Trying gradient_boosting...
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best gradient_boosting score: 0.7957
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 1.0}

Trying svm...
Fitting 5 folds for each of 6 candidates, totalling 30 f

In [None]:
with open('../models/machine_learning.pkl', 'wb') as f:
    pickle.dump(best_models, f)

In [13]:
print("\nFinal Results:")
for emotion, results in best_models.items():
    print(f"\nEmotion: {emotion}")
    print(f"Best model: {results['model_type']}")
    print(f"Best parameters: {results['parameters']}")
    print(f"Best score: {results['best_score']:.4f}")


Final Results:

Emotion: train_joy
Best model: svm
Best parameters: {'C': 1, 'class_weight': 'balanced', 'kernel': 'rbf'}
Best score: 0.6663

Emotion: train_trust
Best model: random_forest
Best parameters: {'class_weight': 'balanced', 'max_depth': 10, 'n_estimators': 200}
Best score: 0.7961

Emotion: train_anticipation
Best model: random_forest
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 200}
Best score: 0.8842

Emotion: train_surprise
Best model: gradient_boosting
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Best score: 0.9358

Emotion: train_fear
Best model: gradient_boosting
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Best score: 0.9592

Emotion: train_sadness
Best model: svm
Best parameters: {'C': 1, 'class_weight': 'balanced', 'kernel': 'rbf'}
Best score: 0.7003

Emotion: train_disgust
Best mode