In [1]:
# Libraries for data loading, data manipulation and data visulisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet # To provide a best estimate of the encoding that was used in the text data
import io # For string operations
%matplotlib inline

# Libraries for data preparation and model building
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, TweetTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
import string
import datetime
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import math
import re
from sklearn.utils import resample
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE,SMOTENC
from imblearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# LOAD DATASET

def transform_categorical_labels(data):
    # Dictionary mapping numerical categories to labels
    label_map = {
        2: 'News',
        1: 'Pro',
        0: 'Neutral',
        -1: 'Anti'
    }
    transformed_data = data.map(label_map)
    return transformed_data

def class_distribution(data):
    if isinstance(data, pd.DataFrame):
        unique_classes, class_counts = data.iloc[:, 0].value_counts().index, data.iloc[:, 0].value_counts().values
    elif isinstance(data, pd.Series):
        unique_classes, class_counts = data.value_counts().index, data.value_counts().values
    class_dict = dict(zip(unique_classes, class_counts))
    return class_dict

def load_dataset(environment, size):
    if environment == 'colab':
        from google.colab import drive
        drive.mount('/content/drive')
        csv_file = '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/data/train.csv'
    else:
        csv_file = r'G:\My Drive\Professionele ontwikkeling\Data Science\Explore Data Science Course\Sprint 6_Advanced Classification\Predict\advanced-classification-predict\data\train.csv'
    df = pd.read_csv(csv_file)
    #df['sentiment'] = transform_categorical_labels(df['sentiment'])
    print(f'Dataset original shape: {df.shape}')
    print(f'Dataset original class distribution: {class_distribution(df)}')
    sample_size = int(len(df) * size)
    if size == 1:
        pass
    else:
        X = df.drop(columns=['sentiment']).copy()
        y = df.sentiment.copy()
        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)
        df = pd.concat([X_sample, y_sample], axis=1)

#       np.random.seed(42)  # OLD CODE
#       pd.read_csv(csv_file).shape[0] - n  # OLD CODE
#        df = pd.read_csv(csv_file, skiprows=lambda i: i > 0 and np.random.rand() > n / (i + 1), nrows=n)  # OLD CODE

    class_dict = class_distribution(df['sentiment'])

    return df, sample_size, class_dict


# def load_full_dataset():  # OLD CODE
#   df = pd.read_csv(csv_file)  # OLD CODE
#   return df, 'full'  # OLD CODE


df_train = pd.read_csv('G:/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/data/train.csv')
pd.set_option('display.max_colwidth', None)
df_train.head(10)

# Step 1: Split into train / test
X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42)


class SampleSelector(BaseEstimator, TransformerMixin):
    def __init__(self, sample_percentage=1.0, stratify='y', random_state=42):
        self.sample_percentage = sample_percentage
        self.stratify = stratify
        self.random_state = random_state

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        # Set random seed for reproducibility
        np.random.seed(self.random_state)
        if self.sample_percentage != 1.0:
            print(X.shape)
            print(y.shape)
            print(f'y:')
            print(y.iloc[0])
            # Determine the unique classes in y and their frequencies
            unique_classes, class_counts = np.unique(y, return_counts=True)
            # Determine the number of samples to select for each class
            num_samples_per_class = (class_counts * self.sample_percentage).astype(int)
            # Initialize an empty list to store the sampled indices
            print(unique_classes)
            print(class_counts)
            sampled_indices = []

            # Iterate over each unique class
            for cls, num_samples in zip(unique_classes, num_samples_per_class):
                # Get the indices of samples belonging to the current class
                class_indices = np.where(y == cls)[0]
                # Randomly select samples from the current class
                sampled_indices.extend(np.random.choice(class_indices, size=num_samples, replace=False))

            print(sampled_indices)
            X_sampled = X.iloc[sampled_indices]
            y_sampled = y.iloc[sampled_indices]

        else:
            X_sampled = X
            y_sampled = y

        # Return the randomly sampled subset

        print(type(X_sampled))
        print(f'X_sampled[0]: {X_sampled.iloc[0]}')
        print(f'y_sampled[0]: {y_sampled.iloc[0]}')
        return X_sampled, y_sampled

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        X_sampled = self.transform(X, y)
        return X_sampled, y_sampled

In [3]:
class NoiseRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'   # Find all hyperlinks
        subs_url = r''
        X_transformed = pd.Series(X).replace(to_replace = pattern_url, value = subs_url, regex = True)
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [4]:
class EmoticonConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        emoticon_dictionary = {':\)': 'smiley_face_emoticon',
                               ':\(': 'frowning_face_emoticon',
                               ':D': 'grinning_face_emoticon',
                               ':P': 'sticking_out_tongue_emoticon',
                               ';\)': 'winking_face_emoticon',
                               ':o': 'surprised_face_emoticon',
                               ':\|': 'neutral_face_emoticon',
                               ':\'\)': 'tears_of_joy_emoticon',
                               ':\'\(': 'crying_face_emoticon'}
        X_transformed = X.replace(emoticon_dictionary, regex=True)
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [5]:
class PunctuationRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        def expand_contractions(text):
            contractions = {"'t": " not","'s": " is","'re": " are","'ll": " will", "'m": " am"}
            pattern = re.compile(r"\b(" + "|".join(re.escape(key) for key in contractions.keys()) + r")\b")
            text = re.sub(r"n't\b", " not", text) # Replace "n't" with " not"
            text = pattern.sub(lambda match: contractions[match.group(0)], text) # Replace all other contractions except for "n't"
            return text

        def remove_punctuation(text):
            return ''.join([l for l in text if l not in string.punctuation])

        X_transformed = X.apply(lambda x: expand_contractions(x))
        X_transformed = X_transformed.apply(lambda x: remove_punctuation(x))
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [6]:
class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, type=None):
        self.type = type

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if self.type == 'TweetTokenizer':
            tokenizer = TweetTokenizer()
        else:
            tokenizer = TreebankWordTokenizer()
        X_transformed = X.apply(tokenizer.tokenize)
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [7]:
class StopwordRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        stop_words = set(stopwords.words('english'))
        # Remove stopwords using a vectorized operation
        X_transformed = X.apply(lambda tokens: [t for t in tokens if t.lower() not in stop_words])
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [8]:
class Lemmatizer(BaseEstimator, TransformerMixin):
    def __init__(self, pos='v'):
        self.pos = pos

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        lemmatizer = WordNetLemmatizer()
        X_transformed = X.apply(lambda tokens: [lemmatizer.lemmatize(word, pos=self.pos) for word in tokens])
        return X_transformed

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [9]:
class Vectorize(BaseEstimator, TransformerMixin):
    def __init__(self, type='tfidf', max_df=1, min_df=1, ngram_range=(1,1), max_features=None):
        self.type = type
        self.max_df = max_df
        self.min_df = min_df
        self.ngram_range = ngram_range
        self.max_features = max_features

        if self.type == 'count':
            self.vectorizer = CountVectorizer(max_features=self.max_features, lowercase=True,
                                              max_df=self.max_df, min_df=self.min_df,
                                              ngram_range=self.ngram_range)
        elif self.type == 'tfidf':
            self.vectorizer = TfidfVectorizer(max_features=self.max_features, lowercase=True,
                                              max_df=self.max_df, min_df=self.min_df,
                                              ngram_range=self.ngram_range)
        else:
            raise ValueError("Invalid vectorizer type. Choose either 'count' or 'tfidf'")

    def fit(self, X, y=None):
        X_joined = [' '.join(tokens) for tokens in X]
        self.vectorizer.fit(X_joined)
        return self.vectorizer.fit(X_joined)

    def transform(self, X, y=None):
        X_joined = [' '.join(tokens) for tokens in X]
        return self.vectorizer.transform(X_joined)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

class Resampler(BaseEstimator, TransformerMixin):
    def __init__(self, technique='balanced', random_state=42):
        self.technique = technique
        self.random_state = random_state

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        if self.technique == 'balanced':
            class_labels, counts = np.unique(y, return_counts=True)
            n_classes = class_labels.shape[0]
            balanced_freq = X.shape[0] / n_classes  # Get the number of samples from the shape of X
            X_resampled, y_resampled = [], []
            for class_label, count in zip(class_labels, counts):
                indices = np.atleast_1d(np.where(y == class_label)[0])
                if count > balanced_freq:
                    resampled_indices = resample(indices, replace=False, n_samples=int(balanced_freq), random_state=self.random_state)
                elif count < balanced_freq:
                    resampled_indices = resample(indices, replace=True, n_samples=int(balanced_freq), random_state=self.random_state)
                else:
                    resampled_indices = indices
                if len(resampled_indices) > 0:
                    resampled_X = X[resampled_indices]
                    X_resampled.append(resampled_X)
                    y_resampled.extend([class_label] * resampled_X.shape[0])

            if X_resampled:
                X_resampled = vstack(X_resampled)
            else:
                X_resampled = csr_matrix((0, X.shape[1]))  # Create an empty sparse matrix
            y_resampled = np.array(y_resampled)

        elif self.technique == 'smote':
            sampler = SMOTE(random_state=self.random_state)
            X_dense = X.toarray()
            X_resampled, y_resampled = sampler.fit_resample(X_dense, y)
            X_resampled = csr_matrix(X_resampled)

        else:
            raise ValueError("Invalid resampling technique.  Choose either 'random' or 'smote'")

        return X_resampled, y_resampled

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [10]:
class Scaler(BaseEstimator, TransformerMixin):
    def __init__(self, type='robust'):
        self.type = type

        if self.type == 'robust':
            self.scaler = RobustScaler(with_centering=False)
        elif self.type == 'minmax':
            self.scaler = MinMaxScaler(with_centering=False)
        elif self.type == 'maxabs':
            self.scaler = MaxAbsScaler(with_centering=False)
        else:
            raise ValueError("Invalid scaler type. Choose between 'robust', 'minmax' or 'maxabs'.")

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, X, y=None):
        return self.scaler.transform(X)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [11]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, type='anova_f', percentile=50):
        self.type = type
        self.percentile = percentile

        if self.type == 'mutualinfo':
            self.selector = SelectPercentile(score_func=mutual_info_classif, percentile=percentile)
        elif self.type == 'anova_f':
            self.selector = SelectPercentile(score_func=f_classif, percentile=percentile)
        else:
            raise ValueError("Invalid selector type. Choose between 'mutualinfo' or 'anova_f'.")

    def fit(self, X, y=None):
        self.selector.fit(X, y)
        return self

    def transform(self, X, y=None):
        return self.selector.transform(X)

    def fit_transform(self, X, y=None):
        self.selector.fit(X, y)
        return self.selector.transform(X)

In [12]:
df_train, size, class_dictionary = load_dataset('colab',0.25)
# Next section is to calculate class imbalance, for us in randomover and randomundersampling

number_classes = len(class_dictionary)
majority = max(class_dictionary.values())
average_class_count = int(size / number_classes)
oversampling_strategy = {key: max(average_class_count, count)  for key, count in class_dictionary.items()}
undersampling_strategy = {key: average_class_count  for key, count in oversampling_strategy.items()}

df_train.head()

Mounted at /content/drive
Dataset original shape: (15819, 3)
Dataset original class distribution: {1: 8530, 2: 3640, 0: 2353, -1: 1296}


Unnamed: 0,message,tweetid,sentiment
6196,RT @Angus_OL: Art\n\nPoliticians discussing wh...,145696,1
1231,RT @WGNWeatherGuy: Home Depot reaps a hurrican...,83124,2
260,@lawlib Hey! We have a new board for bouncing ...,708028,1
8072,RT @Fusion: When it comes to Breitbart News' r...,533025,1
4956,RT @JayMontanaa300: 84 degrees in Atlanta........,168438,1


In [13]:
## FOR XGBOOST ONLY

#from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
#label_encoder = LabelEncoder()

# Fit and transform the string labels to integer labels
#df_train["sentiment"] = label_encoder.fit_transform(df_train["sentiment"])

In [15]:

X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42)#, stratify=df_train['sentiment'])
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')
print(f'Distribution of y_train: {class_distribution(y_train)}')
print(f'Distribution of y_test: {class_distribution(y_test)}')

# Defining preprocessing steps
preprocessing_steps = [

                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer()),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize(max_df=0.75, min_df=1, ngram_range=(1,1),max_features=None, type='tfidf')),
                       #('oversampler', RandomOverSampler(sampling_strategy=oversampling_strategy, random_state=42)),
                       #('undersampler', RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)),
                       ('smote', SMOTE(k_neighbors=4, random_state=42)),
                       ('scaler', Scaler(type='robust')),
                       #('feature_selection', FeatureSelector())
                       ('feature_selection', FeatureSelector(percentile=99, type='anova_f'))
                       ]


# Create the pipelines
#model = xgb.XGBClassifier()
#model = AdaBoostClassifier()
#model = MultinomialNB()
model = LogisticRegression(C=1, max_iter=1000000,multi_class='multinomial', solver='sag', tol=0.001, penalty='l2', random_state=42)
#model = SVC(C=100,gamma=0.1)  #THIS SEEMS TO BE THE BEST
#model = SVC()
#model = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', alpha=0.0001,
#                    batch_size='auto', learning_rate='constant', learning_rate_init=0.001,
#                    max_iter=200, random_state=42)

model_name = 'LogisticRegression'
pipeline = Pipeline(preprocessing_steps  + [('model', model)])

# Define parameter grid for GridSearchCV
param_grid = {
    'tokenization__type': ['TweetTokenizdder'],  # 'TreebankWordTokenizer'],
    'vectorization__type': ['tfidf'],  # ,'count'],
    'vectorization__max_df': [0.75],#,0.5],
    'vectorization__min_df': [1],
    'vectorization__ngram_range': [(1, 1)],#, (1, 5)],# (1, 10)],
    'vectorization__max_features': [None],
    'scaler__type': ['robust'],  # 'minmax','maxabs'],
    'feature_selection__type': ['anova_f'],
    #'feature_selection__percentile': [99],


             # Hyperparameters for the model

              #'model__n_neighbors': [3],                                #KNN
              #'model__weights': [ 'distance'],#'uniform',               #KNN
              #'model__metric': ['euclidean']#, 'manhattan'],            #KNN
              #'model__C': [1,10,100],                                  #SVM RBF
              #'model__gamma': [0.001, 0.01, 0.1]                                 #SVM RBF
               'model__penalty': ['l2'],#,'l1'],                            #LogisticRegression
               'model__C': [10],#,100,1,0.001,0.1,],                             #LogisticRegression
               'model__solver': ['liblinear'],#,'sag'],                     #LogisticRegression
               'model__multi_class': ['ovr'],#'multinomial',               #LogisticRegression
               'model__max_iter': [10000],                               #LogisticRegression
               'model__tol': [0.0001],#, 0.0001],                                #LogisticRegression
               'model__n_jobs': [-1],                                           #LogisticRegression
               'model__random_state': [42],                                     #LogisticRegression
               #'model__alpha': [0.01,0.1,1,5,10],                               #Multinomial NB
               #'model__fit_prior': [True, False],                               #Multinomial NB
               #'model__n_estimators': [500],                              #Adaboost
               #'model__learning_rate': [0.1],#, 1, 5],                              #Adaboost
               #'model__random_state': [42],                                     #Adaboost
               #'model__algorithm': ['SAMME'],                                    #Adaboost
               #'model__estimator': [LogisticRegression()]#,SVC(kernel='rbf', C=100,gamma=0.1)],DecisionTreeClassifier(), ],   #Adaboost

                #'model__n_estimators': [100],#200],                               #XGBoost
                #'model__max_depth': [30],#,6],                               #XGBoost
                #'model__learning_rate': [0.3, 1],                               #XGBoost
                #'model__subsample': [1.0],                               #XGBoost
              }

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', error_score='raise', verbose=2)
grid_search.fit(X_train, y_train)
df_gridsearch = pd.DataFrame(grid_search.cv_results_)
print(grid_search.best_score_)
print(grid_search.best_params_)

selected_columns = ['Model','Dataset','Timestamp','mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score',
                    'split2_test_score','split3_test_score','split4_test_score',
                    'std_test_score','mean_test_score']

#Create blank results dataframe - only do this once
###results_df = pd.DataFrame(columns=selected_columns)
###results_df.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model results.csv', index=False)
new_data = pd.DataFrame(columns=selected_columns)

new_data[['mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score',
         'split2_test_score','split3_test_score','split4_test_score',
          'std_test_score','mean_test_score']] = df_gridsearch[['mean_fit_time',
         'mean_score_time','params','split0_test_score','split1_test_score',
         'split2_test_score','split3_test_score','split4_test_score',
         'std_test_score','mean_test_score']]
new_data['Model'] = model_name
new_data['Dataset'] = size
new_data['Timestamp']= datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
new_data.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model results.csv', mode='a', header=False, index=False)


# Do a stand-alon pipeline to see the full class report
#pipeline.fit(X_train, y_train)
#y_pred = pipeline.predict(X_test)
#report = classification_report(y_test, y_pred)
#print("Classification Report for test set:")
#print(report)
#print('************************')

#k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
# Perform k-fold cross-validation and generate predictions
#predictions = cross_val_predict(pipeline, X_train, y_train, cv=k_fold)

# Generate classification report
#report = classification_report(y_train, predictions)

# Print the classification report
#print("Classification Report for CV:\n", report)



Shape of X_train: (3163,)
Shape of y_train: (3163,)
Shape of X_test: (791,)
Shape of y_test: (791,)
Distribution of y_train: {1: 1700, 2: 735, 0: 472, -1: 256}
Distribution of y_test: {1: 432, 2: 175, 0: 116, -1: 68}
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV] END feature_selection__type=anova_f, model__C=10, model__max_iter=10000, model__multi_class=ovr, model__n_jobs=-1, model__penalty=l2, model__random_state=42, model__solver=liblinear, model__tol=0.0001, scaler__type=robust, tokenization__type=TweetTokenizdder, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   4.3s




[CV] END feature_selection__type=anova_f, model__C=10, model__max_iter=10000, model__multi_class=ovr, model__n_jobs=-1, model__penalty=l2, model__random_state=42, model__solver=liblinear, model__tol=0.0001, scaler__type=robust, tokenization__type=TweetTokenizdder, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   2.0s




[CV] END feature_selection__type=anova_f, model__C=10, model__max_iter=10000, model__multi_class=ovr, model__n_jobs=-1, model__penalty=l2, model__random_state=42, model__solver=liblinear, model__tol=0.0001, scaler__type=robust, tokenization__type=TweetTokenizdder, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   2.0s




[CV] END feature_selection__type=anova_f, model__C=10, model__max_iter=10000, model__multi_class=ovr, model__n_jobs=-1, model__penalty=l2, model__random_state=42, model__solver=liblinear, model__tol=0.0001, scaler__type=robust, tokenization__type=TweetTokenizdder, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   4.4s




[CV] END feature_selection__type=anova_f, model__C=10, model__max_iter=10000, model__multi_class=ovr, model__n_jobs=-1, model__penalty=l2, model__random_state=42, model__solver=liblinear, model__tol=0.0001, scaler__type=robust, tokenization__type=TweetTokenizdder, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   2.0s
0.5940259378404094
{'feature_selection__type': 'anova_f', 'model__C': 10, 'model__max_iter': 10000, 'model__multi_class': 'ovr', 'model__n_jobs': -1, 'model__penalty': 'l2', 'model__random_state': 42, 'model__solver': 'liblinear', 'model__tol': 0.0001, 'scaler__type': 'robust', 'tokenization__type': 'TweetTokenizdder', 'vectorization__max_df': 0.75, 'vectorization__max_features': None, 'vectorization__min_df': 1, 'vectorization__ngram_range': (1, 1), 'vectorization__type': 'tfidf'}




In [None]:
print(grid_search.best_params_)

{'feature_selection__percentile': 50, 'feature_selection__type': 'anova_f', 'model__C': 10, 'model__gamma': 1, 'scaler__type': 'robust', 'tokenization__type': 'TweetTokenizer', 'vectorization__max_features': 10000, 'vectorization__type': 'tfidf'}


In [98]:
scorer = make_scorer(f1_score, average='macro')
f1_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scorer)
print("Mean F1 macro score:", f1_scores.mean())
pipeline.fit(X_train, y_train)
model_stats = pipeline.named_steps['model']

# Obtain the number of features used by the model
attributes = model_stats.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Mean F1 macro score: 0.27094834786524913




decision_function_shape : ovr
break_ties : False
kernel : rbf
degree : 3
gamma : 1
coef0 : 0.0
tol : 0.001
C : 1
nu : 0.0
epsilon : 0.0
shrinking : True
probability : False
cache_size : 200
class_weight : None
verbose : False
max_iter : -1
random_state : None
_sparse : True
n_features_in_ : 1511
class_weight_ : [1. 1. 1. 1.]
classes_ : [-1  0  1  2]
_gamma : 1
support_ : [  0   1   3   4   6   7   8  10  11  12  13  16  17  18  19  22  23  25
  26  27  28  29  30  31  32  33  36  38  39  42  43  44  45  46  48  49
  50  51  52  53  54  55  56  57  58  59  60  62  63  64  65  66  67  70
  72  73  74  76  77  78  79  81  82  84  85  86  88  89  91  92  93  94
  95  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
 133 134 135 136 137 138 139 140 141 143 144 145 146 147 148 149 150 151
 152 153 154 155 157 158 159 160 161 162 163 164 165 166 167 168 169 170
 171 172 173 174 175 176 177 178 179 180 1

  f = msb / msw


In [64]:
# MODEL COMPARISON
df_train, size, class_dictionary = load_dataset('colab',1)
df_train['sentiment'] = transform_categorical_labels(df_train['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42, stratify=df_train['sentiment'])
preprocessing_steps = [
                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer(type='TweetTokenizer')),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize()),
                       #('oversampler', RandomOverSampler(sampling_strategy=oversampling_strategy, random_state=42)),
                       #('undersampler', RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)),
                       ('smote', SMOTE(k_neighbors=3, random_state=42)),
                       ('scaler', Scaler(type='robust')),
                       ('feature_selection', FeatureSelector(percentile=60, type='anova_f'))
                       ]
models =        {
                 'Multinomial Naive Bayes': MultinomialNB(),
                 'Logistic Regression': LogisticRegression(max_iter=1000),
                 'KNN': KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', leaf_size=30, metric='minkowski', p=2, weights='distance'),
                 'SVC - linear': SVC(kernel="linear", C=0.025),
                 #'SVC - RBF': SVC(gamma=1, C=1),
                 'DecisionTree': DecisionTreeClassifier(max_depth=5),
                 'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
                 'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=200),
                }

# Create the pipelines

for name, model in models.items():
    pipeline = Pipeline(preprocessing_steps  + [('model', model)])
    param_grid = {
    'tokenization__type': ['TweetTokenizer'],  # 'TreebankWordTokenizer'],
    'vectorization__type': ['tfidf'],  # ,'count'],
    'vectorization__max_df': [0.75],
    'vectorization__min_df': [1],
    'vectorization__ngram_range': [(1, 1)],#, (1, 5), (1, 10)],
    'vectorization__max_features': [None],
    'scaler__type': ['robust'],  # 'minmax','maxabs'],
    'feature_selection__type': ['anova_f'],
    'feature_selection__percentile': [60],
                }

    # Create GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', error_score='raise', verbose=2)
    grid_search.fit(X_train, y_train)
    df_gridsearch = pd.DataFrame(grid_search.cv_results_)
    #Create blank results dataframe - only do this once
    selected_columns = ['Model','Dataset','Timestamp','mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']
    #results_df = pd.DataFrame(columns=selected_columns)
    #results_df.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model_Comparison results.csv', index=False)
    new_data = pd.DataFrame(columns=selected_columns)
    new_data[['mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score',
         'split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']] = df_gridsearch[['mean_fit_time',
         'mean_score_time','params','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']]
    new_data['Model'] = name
    new_data['Dataset'] = size
    new_data['Timestamp']= datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    new_data.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model_Comparison results.csv', mode='a', header=False, index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset original shape: (15819, 3)
Dataset original class distribution: {1: 8530, 2: 3640, 0: 2353, -1: 1296}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END feature_selection__percentile=60, feature_selection__type=anova_f, scaler__type=robust, tokenization__type=TweetTokenizer, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   7.2s
[CV] END feature_selection__percentile=60, feature_selection__type=anova_f, scaler__type=robust, tokenization__type=TweetTokenizer, vectorization__max_df=0.75, vectorization__max_features=None, vectorization__min_df=1, vectorization__ngram_range=(1, 1), vectorization__type=tfidf; total time=   8.0s
[CV] END feature_selection__percentile=60, feature_selection__type=anova_f, scaler__type=robust, token

KeyboardInterrupt: 

In [None]:
'''
# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

model_stats = pipeline.named_steps['model']

# Obtain the number of features used by the model
attributes = model_stats.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)
'''

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'G:/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/dawieloots_predict_gridsearch_{timestamp}.csv'
df_gridsearch.to_csv(filename, index=False)

In [None]:

print(df_gridsearch)
print(grid_search.get_params)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Custom transformer to wrap SMOTE

class ResampleAndFeatureSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, k=1000, score_func=chi2, k_neighbors=5):
        self.k = k
        self.score_func = score_func
        self.k_neighbors = k_neighbors
        self.feature_selector = SelectKBest(score_func=self.score_func, k=self.k)
        
    def fit(self, X, y):
        # Print the shape of X_train before preprocessing
               
        # Select features from the resampled data
        self.feature_selector.fit(X, y)
        
        # Print the shape of X_train after preprocessing
        
        return self

    def transform(self, X):
        # Select features from the input data
        X_selected = self.feature_selector.transform(X)
        return X_selected

# Load the 20 Newsgroups dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data_subset = {
    'data': data.data[:1000],
    'target': data.target[:1000],
    'target_names': data.target_names
}

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_subset['data'], data_subset['target'], test_size=0.2, random_state=42)

# Define preprocessing steps
preprocessing_steps = [
    # Text preprocessing
    ('vectorizer', TfidfVectorizer()),  # Convert text data into numerical vectors
    ('smote', SMOTE()),
    ('resample_and_feature_select', ResampleAndFeatureSelectTransformer(k=1000, score_func=chi2)),  # Resample and select top k features
    
]

# Define the model
model = KNeighborsClassifier()

# Create the pipeline
pipeline = Pipeline(preprocessing_steps + [('model', model)])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

model = pipeline.named_steps['model']

# Obtain the number of features used by the model
attributes = model.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)
#n_features = model.n_features_
#print("Number of features used by the model:", n_features)


In [16]:
## Final model building with full trainset
df_train, _, _ = load_dataset('colab',1)
#df_train['sentiment'] = transform_categorical_labels(df_train['sentiment'])
X_train = df_train['message']
y_train = df_train['sentiment']

preprocessing_steps = [
                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer(type='TweetTokenizer')),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize(max_df=0.75, min_df=1, ngram_range=(1,1),max_features=None, type='tfidf')),
                       ('smote', SMOTE(k_neighbors=4, random_state=42)),
                       ('scaler', Scaler(type='robust')),
                       ('feature_selection', FeatureSelector(percentile=99, type='anova_f'))
                       ]

#model = SVC(C=10,gamma=0.01)  #THIS SEEMS TO BE THE BEST
model = LogisticRegression(C=1, max_iter=1000000,multi_class='multinomial', solver='sag', tol=0.001, penalty='l2', random_state=42)
pipeline = Pipeline(preprocessing_steps  + [('model', model)])
pipeline.fit(X_train, y_train)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset original shape: (15819, 3)
Dataset original class distribution: {1: 8530, 2: 3640, 0: 2353, -1: 1296}


In [17]:
model_stats = pipeline.named_steps['model']

# Obtain the number of features used by the model
attributes = model_stats.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)

penalty : l2
dual : False
tol : 0.001
C : 1
fit_intercept : True
intercept_scaling : 1
class_weight : None
random_state : 42
solver : sag
max_iter : 1000000
multi_class : multinomial
verbose : 0
warm_start : False
n_jobs : None
l1_ratio : None
n_features_in_ : 22481
classes_ : [-1  0  1  2]
n_iter_ : [55]
coef_ : [[-0.09849407 -0.06170661 -0.0242912  ... -0.0316989  -0.0316989
  -0.04595831]
 [ 0.47132916 -0.01673616 -0.0268516  ...  0.17909438  0.17909438
   0.18485926]
 [-0.11636423  0.17776388  0.06536473 ... -0.0932192  -0.0932192
  -0.0767753 ]
 [-0.25647085 -0.09932111 -0.01422193 ... -0.05417628 -0.05417628
  -0.06212565]]
intercept_ : [-0.73788254  1.18096025 -0.08638792 -0.35668979]


In [19]:
test_file = '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/data/test_with_no_labels.csv'
df_test = pd.read_csv(test_file)
X_unseen = df_test['message']

# Predict
y_pred = pipeline.predict(X_unseen)
label_map = {
        'News': 2,
        'Pro': 1,
        'Neutral': 0,
        'Anti': -1
    }
#y_pred_int = [label_map[label] for label in y_pred_text]

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/dawieloots_predict_{timestamp}.csv'
submission_df = pd.DataFrame()
submission_df['tweetid'] = df_test.tweetid
submission_df['sentiment'] = y_pred
submission_df.to_csv(filename, index=False)