In [2]:
# Libraries for data loading, data manipulation and data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet # To provide a best estimate of the encoding that was used in the text data
import io # For string operations
%matplotlib inline

# Libraries for data preparation and model building
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, TweetTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
import string
import datetime
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import math
import re
from sklearn.utils import resample
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE,SMOTENC
from imblearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dawie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dawie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# LOAD DATASET

def class_distribution(data):
    """Return the relative frequency of every class label.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or 1-D array-like
        For a DataFrame the labels are read from its FIRST column; a Series is
        used as-is; any other input (list, tuple, 1-D ndarray) is wrapped in a
        Series.

    Returns
    -------
    dict
        ``{label: share}`` with each share rounded to three decimals, ordered
        from most to least frequent label. Empty input yields ``{}``.
    """
    if isinstance(data, pd.DataFrame):
        labels = data.iloc[:, 0]
    elif isinstance(data, pd.Series):
        labels = data
    else:
        # Previously any other type fell through both branches and crashed
        # with an UnboundLocalError; coerce it instead.
        labels = pd.Series(data)

    # Count once and reuse (the original called value_counts() twice).
    counts = labels.value_counts()
    total = len(labels)
    return {
        class_name: round(count / total, 3)
        for class_name, count in zip(counts.index, counts.values)
    }

def load_dataset(environment, size):
    """Read the training CSV and optionally reduce it to a stratified sample.

    Parameters
    ----------
    environment : str
        ``'colab'`` mounts Google Drive and reads from the Colab path; any
        other value reads from the local ``G:`` drive path.
    size : float
        Fraction of rows to keep. ``1`` returns the file unchanged; any other
        value draws ``int(len(df) * size)`` rows, stratified on ``sentiment``
        with a fixed seed.

    Returns
    -------
    tuple
        ``(df, sample_size)``. When a sample is drawn, ``sentiment`` becomes
        the last column of ``df``.
    """
    if environment == 'colab':
        from google.colab import drive
        drive.mount('/content/drive')
        csv_file = '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/data/train.csv'
    else:
        csv_file = r'G:\My Drive\Professionele ontwikkeling\Data Science\Explore Data Science Course\Sprint 6_Advanced Classification\Predict\advanced-classification-predict\data\train.csv'

    df = pd.read_csv(csv_file)
    print(f'Dataset original shape: {df.shape}')
    print(f'Dataset original class distribution: {class_distribution(df)}')

    sample_size = int(len(df) * size)
    if size != 1:
        # Split features and target only so train_test_split can stratify on
        # the target; the discarded halves are the rows left out of the sample.
        features = df.drop(columns=['sentiment']).copy()
        target = df.sentiment.copy()
        features_sample, _, target_sample, _ = train_test_split(
            features, target, train_size=sample_size, stratify=target, random_state=42
        )
        df = pd.concat([features_sample, target_sample], axis=1)

    return df, sample_size


# def load_full_dataset():  # OLD CODE
#   df = pd.read_csv(csv_file)  # OLD CODE
#   return df, 'full'  # OLD CODE


# Quick look at the raw training data, read directly (bypasses load_dataset).
# NOTE(review): this path spells the folder 'MyDrive' while load_dataset's local
# path uses 'My Drive' — confirm which one exists on this machine.
df_train = pd.read_csv('G:/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/data/train.csv')
# Do not truncate long cell contents when rendering the frame.
pd.set_option('display.max_colwidth', None)
df_train.head(10)

In [4]:
class NoiseRemover(BaseEstimator, TransformerMixin):
    """Pipeline step that strips hyperlinks from raw text.

    The step is stateless: ``fit`` learns nothing from the data.
    """

    # Matches http:// and https:// links.
    _URL_PATTERN = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        ``y`` is optional, matching the other transformers in this notebook
        (it used to be mandatory, so ``fit(X)`` raised a TypeError).
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a pandas Series with every hyperlink removed.

        ``X`` may be any input accepted by ``pd.Series`` (a Series keeps its
        index).
        """
        return pd.Series(X).replace(to_replace=self._URL_PATTERN, value=r'', regex=True)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [5]:
class EmoticonConverter(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces common ASCII emoticons with word tokens.

    The step is stateless: ``fit`` learns nothing from the data.
    """

    # Regex pattern -> replacement text. Raw strings keep the patterns
    # identical to the former non-raw literals while avoiding Python's
    # invalid-escape-sequence warnings for '\)' / '\(' / '\|'.
    # The replacement is inserted without surrounding spaces.
    _EMOTICON_PATTERNS = {
        r':\)': 'smiley_face_emoticon',
        r':\(': 'frowning_face_emoticon',
        r':D': 'grinning_face_emoticon',
        r':P': 'sticking_out_tongue_emoticon',
        r';\)': 'winking_face_emoticon',
        r':o': 'surprised_face_emoticon',
        r':\|': 'neutral_face_emoticon',
        r":'\)": 'tears_of_joy_emoticon',
        r":'\(": 'crying_face_emoticon',
    }

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        ``y`` is optional, matching the other transformers in this notebook
        (it used to be mandatory, so ``fit(X)`` raised a TypeError).
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` with each emoticon pattern replaced by its token.

        ``X`` must provide a pandas-style ``replace(dict, regex=True)``
        (e.g. a pandas Series of strings).
        """
        return X.replace(self._EMOTICON_PATTERNS, regex=True)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [6]:
class PunctuationRemover(BaseEstimator, TransformerMixin):
    """Pipeline step that expands contractions and then strips punctuation.

    The step is stateless: ``fit`` learns nothing from the data.
    """

    # Suffix contractions and their expansions ("n't" is handled separately).
    _CONTRACTIONS = {"'t": " not", "'s": " is", "'re": " are", "'ll": " will", "'m": " am"}
    # Compiled once for the class instead of once per text row.
    _CONTRACTION_PATTERN = re.compile(
        r"\b(" + "|".join(re.escape(key) for key in _CONTRACTIONS) + r")\b"
    )
    # Translation table deleting every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        ``y`` is optional, matching the other transformers in this notebook
        (it used to be mandatory, so ``fit(X)`` raised a TypeError).
        """
        return self

    def _expand_contractions(self, text):
        """Expand contractions in one string.

        "n't" is replaced by " not" first, so the characters before it are
        kept verbatim (e.g. "can't" becomes "ca not"); the remaining suffixes
        are then expanded through the lookup table.
        """
        text = re.sub(r"n't\b", " not", text)
        return self._CONTRACTION_PATTERN.sub(
            lambda match: self._CONTRACTIONS[match.group(0)], text
        )

    def _remove_punctuation(self, text):
        """Drop every ``string.punctuation`` character from one string."""
        return text.translate(self._PUNCTUATION_TABLE)

    def transform(self, X, y=None):
        """Return ``X`` (a Series of strings) with contractions expanded and punctuation removed."""
        expanded = X.apply(self._expand_contractions)
        return expanded.apply(self._remove_punctuation)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [7]:
class Tokenizer(BaseEstimator, TransformerMixin):
    """Pipeline step that splits each text into a list of tokens.

    Parameters
    ----------
    type : str, default 'TweetTokenizer'
        ``'TweetTokenizer'`` selects NLTK's TweetTokenizer; any other value
        selects NLTK's TreebankWordTokenizer.
    """

    def __init__(self, type='TweetTokenizer'):
        self.type = type

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply the selected tokenizer to every element of ``X`` (a Series of strings)."""
        # A fresh tokenizer is created per call; it is chosen at transform
        # time so a changed ``type`` attribute takes effect.
        tokenizer_cls = TweetTokenizer if self.type == 'TweetTokenizer' else TreebankWordTokenizer
        return X.apply(tokenizer_cls().tokenize)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [8]:
class StopwordRemover(BaseEstimator, TransformerMixin):
    """Pipeline step that drops English stopwords from token lists."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` (a Series of token lists) without NLTK's English stopwords.

        Tokens are compared case-insensitively but kept in their original case.
        """
        english_stopwords = set(stopwords.words('english'))

        def keep_content_tokens(tokens):
            # Row-wise filter applied through Series.apply.
            return [token for token in tokens if token.lower() not in english_stopwords]

        return X.apply(keep_content_tokens)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [9]:
class Lemmatizer(BaseEstimator, TransformerMixin):
    """Pipeline step that lemmatizes every token with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    pos : str, default 'v'
        Part-of-speech tag passed to ``WordNetLemmatizer.lemmatize`` for all
        tokens.
    """

    def __init__(self, pos='v'):
        self.pos = pos

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` (a Series of token lists) with each token lemmatized."""
        wordnet = WordNetLemmatizer()

        def lemmatize_tokens(tokens):
            return [wordnet.lemmatize(token, pos=self.pos) for token in tokens]

        return X.apply(lemmatize_tokens)

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [10]:
class RandomSampler(BaseEstimator, TransformerMixin):
    """Placeholder for a resampling pipeline step.

    NOTE(review): only ``__init__`` and a no-op ``fit`` are defined; there is
    no ``transform``, so this class cannot be used as a transformer yet and
    is not referenced by any pipeline in this notebook — finish or remove.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self
        

In [11]:
class Vectorize(BaseEstimator, TransformerMixin):
    """Pipeline step turning token lists into a count or TF-IDF matrix.

    Each row's tokens are re-joined with single spaces and passed to a
    scikit-learn ``CountVectorizer`` / ``TfidfVectorizer``.

    Parameters
    ----------
    type : {'tfidf', 'count'}, default 'tfidf'
        Which vectorizer to use.
    max_df, min_df, ngram_range, max_features
        Forwarded unchanged to the underlying vectorizer.

        NOTE(review): the ``max_df=1`` default is an *integer*, which
        scikit-learn reads as an absolute document count (drop terms found in
        more than one document), not as the proportion 1.0. The default is
        kept for backward compatibility; pass a float such as ``1.0`` to
        disable the upper cut-off.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'count'`` nor ``'tfidf'`` (checked at
        construction and again on every fit).
    """

    def __init__(self, type='tfidf', max_df=1, min_df=1, ngram_range=(1,1), max_features=None):
        self.type = type
        self.max_df = max_df
        self.min_df = min_df
        self.ngram_range = ngram_range
        self.max_features = max_features

        # Built eagerly so an invalid ``type`` still fails at construction
        # time and ``.vectorizer`` exists before fitting, as before.
        self.vectorizer = self._build_vectorizer()

    def _build_vectorizer(self):
        """Create an unfitted vectorizer from the CURRENT parameter values."""
        if self.type == 'count':
            vectorizer_cls = CountVectorizer
        elif self.type == 'tfidf':
            vectorizer_cls = TfidfVectorizer
        else:
            raise ValueError("Invalid vectorizer type. Choose either 'count' or 'tfidf'")
        return vectorizer_cls(max_features=self.max_features, lowercase=True,
                              max_df=self.max_df, min_df=self.min_df,
                              ngram_range=self.ngram_range)

    @staticmethod
    def _join_tokens(X):
        """Turn an iterable of token lists into a list of space-joined strings."""
        return [' '.join(tokens) for tokens in X]

    def fit(self, X, y=None):
        """Learn the vocabulary from ``X`` (an iterable of token lists); returns ``self``.

        The vectorizer is rebuilt here because ``set_params`` (used by
        ``GridSearchCV``) only updates the attributes and never re-runs
        ``__init__``; without the rebuild, searched values such as ``max_df``
        would silently be ignored. The original also fitted twice and returned
        the inner vectorizer instead of ``self``.
        """
        self.vectorizer = self._build_vectorizer()
        self.vectorizer.fit(self._join_tokens(X))
        return self

    def transform(self, X, y=None):
        """Return the document-term matrix for ``X`` using the fitted vocabulary."""
        return self.vectorizer.transform(self._join_tokens(X))

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its document-term matrix (tokens are joined only once)."""
        self.vectorizer = self._build_vectorizer()
        return self.vectorizer.fit_transform(self._join_tokens(X))

In [12]:
class Scaler(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping one of three scikit-learn scalers.

    Parameters
    ----------
    type : {'robust', 'minmax', 'maxabs'}, default 'robust'
        ``'robust'`` -> ``RobustScaler(with_centering=False)``,
        ``'minmax'`` -> ``MinMaxScaler()``,
        ``'maxabs'`` -> ``MaxAbsScaler()``.

        NOTE(review): MinMaxScaler is understood not to accept sparse input,
        which is what the vectorization step produces — confirm before
        enabling ``'minmax'`` in a grid.

    Raises
    ------
    ValueError
        If ``type`` is not one of the three names (checked at construction
        and again on every fit).
    """

    def __init__(self, type='robust'):
        self.type = type

        # Built eagerly so an invalid ``type`` still fails at construction
        # time and ``.scaler`` exists before fitting, as before.
        self.scaler = self._build_scaler()

    def _build_scaler(self):
        """Create an unfitted scaler from the CURRENT ``type`` value."""
        if self.type == 'robust':
            return RobustScaler(with_centering=False)
        if self.type == 'minmax':
            # ``with_centering`` exists only on RobustScaler; passing it to
            # MinMaxScaler / MaxAbsScaler raised a TypeError.
            return MinMaxScaler()
        if self.type == 'maxabs':
            return MaxAbsScaler()
        raise ValueError("Invalid scaler type. Choose between 'robust', 'minmax' or 'maxabs'.")

    def fit(self, X, y=None):
        """Fit the selected scaler on ``X``; returns ``self``.

        The scaler is rebuilt here because ``set_params`` (used by
        ``GridSearchCV``) only updates ``self.type`` and never re-runs
        ``__init__``.
        """
        self.scaler = self._build_scaler()
        self.scaler.fit(X)
        return self

    def transform(self, X, y=None):
        """Scale ``X`` with the fitted scaler."""
        return self.scaler.transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the scaled ``X``."""
        return self.fit(X, y).transform(X, y)

In [13]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step keeping the top-scoring percentile of features.

    Parameters
    ----------
    type : {'anova_f', 'mutualinfo'}, default 'anova_f'
        Scoring function: ``f_classif`` or ``mutual_info_classif``.
    percentile : int, default 50
        Percentage of features to keep (forwarded to ``SelectPercentile``).

    Raises
    ------
    ValueError
        If ``type`` is not one of the two names (checked at construction and
        again on every fit).
    """

    def __init__(self, type='anova_f', percentile=50):
        self.type = type
        self.percentile = percentile

        # Built eagerly so an invalid ``type`` still fails at construction
        # time and ``.selector`` exists before fitting, as before.
        self.selector = self._build_selector()

    def _build_selector(self):
        """Create an unfitted selector from the CURRENT parameter values."""
        if self.type == 'mutualinfo':
            score_func = mutual_info_classif
        elif self.type == 'anova_f':
            score_func = f_classif
        else:
            raise ValueError("Invalid selector type. Choose between 'mutualinfo' or 'anova_f'.")
        return SelectPercentile(score_func=score_func, percentile=self.percentile)

    def fit(self, X, y=None):
        """Score the features of ``X`` against ``y``; returns ``self``.

        The selector is rebuilt here because ``set_params`` (used by
        ``GridSearchCV``) only updates the attributes and never re-runs
        ``__init__``; without the rebuild, searched ``type`` / ``percentile``
        values would silently be ignored.
        """
        self.selector = self._build_selector()
        self.selector.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Return ``X`` reduced to the selected features."""
        return self.selector.transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``(X, y)`` and return ``X`` reduced to the selected features."""
        return self.fit(X, y).transform(X)
In [71]:
def transform_categorical_labels(data):
    """Translate numeric sentiment codes into their text labels.

    Parameters
    ----------
    data : pandas.Series
        Sentiment codes (-1, 0, 1, 2).

    Returns
    -------
    pandas.Series
        Same index as ``data``; codes missing from the mapping become NaN.
    """
    return data.map({
        -1: 'Anti',
        0: 'Neutral',
        1: 'Pro',
        2: 'News',
    })

In [None]:
# RUN GRIDSEARCH ON ONE SPECIFIC MODEL

# Work on a 10% stratified sample of the training file and use text labels.
df_train, size = load_dataset('jupyter',0.1)
df_train['sentiment'] = transform_categorical_labels(df_train['sentiment'])
# Hold out 20% of the sample, stratified on the label.
# NOTE(review): X_test / y_test are only printed in this cell, never scored.
X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42, stratify=df_train['sentiment'])

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')
print(f'Distribution of y_train: {class_distribution(y_train)}')
print(f'Distribution of y_test: {class_distribution(y_test)}')

# Defining preprocessing steps
# Text cleaning -> tokens -> matrix -> oversampling -> scaling -> feature selection.
# NOTE(review): SMOTE() is created without random_state, so the oversampling
# is presumably not reproducible between runs — confirm and seed if needed.
preprocessing_steps = [
                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer(type='TweetTokenizer')),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize(max_features=2000, type='tfidf')),
                       ('smote', SMOTE()),
                       ('scaler', Scaler(type='robust')),
                       ('feature_selection', FeatureSelector(percentile=50, type='mutualinfo'))
                       ]

# Create the pipeline
# Pipeline here is imblearn's Pipeline (imported in the first cell), which is
# what allows the SMOTE sampler to sit among the steps.
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model_name = 'MLP'
pipeline = Pipeline(preprocessing_steps  + [('model', model)])

# Define parameter grid for GridSearchCV
# Every preprocessing entry is a single-value list, so the grid pins one
# configuration per step; only the MLP hidden layer sizes vary (2 candidates).
param_grid = {'tokenization__type': ['TweetTokenizer'],#'TreebankWordTokenizer'],
              'vectorization__type': ['tfidf'],
              'vectorization__max_df': [0.75],
              'vectorization__min_df': [1],#10],
              'vectorization__ngram_range': [(1,1)],
              'vectorization__max_features': [None],
              'scaler__type': ['robust'],#'minmax','maxabs'],
              'feature_selection__type': ['anova_f'],
              'feature_selection__percentile': [60],

            # Hyperparameters for the model

              #'model__n_neighbors': [3],                               #KNN
              #'model__weights': [ 'distance'],#'uniform',              #KNN
              #'model__metric': ['euclidean']#, 'manhattan'],           #KNN
              #'model__penalty': ['l1','l2','elasticnet',None],         #LogisticRegression
              #'model__C': [0.01, 0.1 , 1, 2],                           #LogisticRegression
              #'model__solver': ['liblinear', 'sag'],                   #LogisticRegression
              #'model__max_iter': [5000],                                #LogisticRegression
              #'model__multi_class': ['ovr', 'multinomial'],             #LogisticRegression
              #'model__random_state': [42],                              #LogisticRegression
              #'model__C': [1],                                    #SVM RBF
              #'model__gamma': [1]                                      #SVM RBF
               'model__hidden_layer_sizes': [(100, ),(100,50)], 
               'model__activation': ['relu'],
               'model__solver': ['adam'], 
               'model__random_state': [42]
}

# Create GridSearchCV object
# 5-fold CV scored on macro-averaged F1; error_score='raise' aborts the search
# on the first failing fit instead of recording a NaN score.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', error_score='raise', verbose=2)
grid_search.fit(X_train, y_train)
# Keep the full CV results table; later cells export and print it.
df_gridsearch = pd.DataFrame(grid_search.cv_results_)
print(grid_search.best_score_)
print(grid_search.best_params_)


#Create blank results dataframe - only do this once
#selected_columns = ['Model','Dataset','Timestamp','mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']
###results_df = pd.DataFrame(columns=selected_columns)
###results_df.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model results.csv', index=False)
#new_data = pd.DataFrame(columns=selected_columns)
#new_data[['mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score',
#         'split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']] = df_gridsearch[['mean_fit_time',
#         'mean_score_time','params','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']]
#new_data['Model'] = model_name
#new_data['Dataset'] = size
#new_data['Timestamp']= datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
#new_data.to_csv('/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model results.csv', mode='a', header=False, index=False)
###filename = f'/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/results_{timestamp}.xlsx'
###df_gridsearch.to_excel(filename, index=False)


In [None]:
# FIT ONE MODEL TO CHECK AGAINST GRIDSEARCH
# Use the complete training file (size=1 skips sampling) with text labels.
df_train, size = load_dataset('jupyter',1)
df_train['sentiment'] = transform_categorical_labels(df_train['sentiment'])

# Hold out 20% for the classification report below, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42, stratify=df_train['sentiment'])

# Same step order as the grid-search cell, with fixed hyperparameters.
# NOTE(review): SMOTE() is created without random_state, so the oversampling
# is presumably not reproducible between runs — confirm and seed if needed.
preprocessing_steps = [
                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer(type='TweetTokenizer')),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize(max_features=20000,max_df=0.6,ngram_range=(1,15), type='tfidf')),
                       #('randomoversampler', RandomOverSampler(sampling_strategy='auto')),
                       #('randomundersampler', RandomUnderSampler(sampling_strategy='auto')),
                       ('smote', SMOTE()),
                       ('scaler', Scaler(type='robust')),
                       ('feature_selection', FeatureSelector(percentile=2, type='anova_f'))
                       ]
from sklearn.neural_network import MLPClassifier
# NOTE(review): a single hidden layer of 10000 units with up to 10000
# iterations is likely slow to train — confirm this size is intended.
model = MLPClassifier(hidden_layer_sizes=(10000,), 
                     activation='relu',
                     solver='adam', 
                     random_state=42,
                     max_iter=10000)
pipeline = Pipeline(preprocessing_steps + [('model', model)])
#scorer = make_scorer(f1_score, average='macro')
#f1_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scorer)
#print("Mean F1 macro score:", f1_scores.mean())
# Fit on the training split and report per-class metrics on the hold-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
#model_stats = pipeline.named_steps['model']

# Obtain the number of features used by the model
#attributes = model_stats.__dict__
#shape = attributes['shape_fit_']
#cw = attributes['class_weight_']
#print(f'Shape: {shape}')
#print(f'class weight: {cw}')

# Print all attributes
#for attr, value in attributes.items():
#    print(attr, ":", value)


Dataset original shape: (15819, 3)
Dataset original class distribution: {1: 0.539, 2: 0.23, 0: 0.149, -1: 0.082}


In [64]:
# Work on a 20% stratified sample; the sentiment labels stay numeric here
# (transform_categorical_labels is not applied in this cell).
df_train, size = load_dataset('jupyter',0.2)

# Hold out 20% of the sample, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42, stratify=df_train['sentiment'])

'''
## CUSTOM CLASS RESAMPLING
df_train = pd.concat([X_train, y_train], axis=1)
class_min1 = df_train[df_train['sentiment']==-1]
class_0 = df_train[df_train['sentiment']==0]
class_1 = df_train[df_train['sentiment']==1]
class_2 = df_train[df_train['sentiment']==2]
balance = len(df_train) // 4 # The number of samples that will result in class balance
df_train_class1_resampled = resample(class_1,
                            replace=False, # sample without replacement (no need to duplicate observations)
                            n_samples=balance, # make all classes equal
                            random_state=27) # reproducible results
df_train_classmin1_resampled = resample(class_min1,
                            replace=True, # sample with replacement (we need to duplicate observations)
                            n_samples=balance, # make all classes equal
                            random_state=27) # reproducible results
df_train_class0_resampled = resample(class_0,
                            replace=True, # sample with replacement (we need to duplicate observations)
                            n_samples=balance, # make all classes equal
                            random_state=27) # reproducible results
df_train_class2_resampled = resample(class_2,
                            replace=True, # sample with replacement (we need to duplicate observations)
                            n_samples=balance, # make all classes equal
                            random_state=27) # reproducible results

df_train.reset_index(drop=True, inplace=True) # Reset index before upsampling
df_train = pd.concat([df_train_class1_resampled, df_train_classmin1_resampled,
                                df_train_class0_resampled, df_train_class2_resampled])
df_train.set_index(df_train.index, inplace=True) # Set the default integer index as the new index after upsampling

# Check new class counts
print(df_train['sentiment'].value_counts())
X_train = df_train['message'].squeeze()
y_train = df_train['sentiment'].squeeze()
'''

# Build every preprocessing step as a standalone object so the stages can be
# run and inspected one at a time, outside a Pipeline.
noise_removal = NoiseRemover()
emoticon_convertion = EmoticonConverter()
punctuation_removal = PunctuationRemover()
tokenization = Tokenizer(type='TweetTokenizer')
stopword_removal = StopwordRemover()
lemmatization = Lemmatizer()
# NOTE(review): max_df=0.01 is a float, which scikit-learn is understood to
# treat as a proportion (terms in more than 1% of documents are dropped) — confirm intended.
vectorization = Vectorize(max_df=0.01,ngram_range = (1,1), max_features=None, type='tfidf')
smote = SMOTE()
scaler = Scaler(type='robust')
feature_selection = FeatureSelector(percentile=60, type='anova_f')
# Apply the text steps in pipeline order; X_train is overwritten at each stage,
# so re-running this part requires re-running the split above first.
X_train = noise_removal.fit_transform(X_train)
X_train = emoticon_convertion.fit_transform(X_train)
X_train = punctuation_removal.fit_transform(X_train)
X_train = tokenization.fit_transform(X_train)
X_train = stopword_removal.fit_transform(X_train)
X_train = lemmatization.fit_transform(X_train)
X_train = vectorization.fit_transform(X_train)
# (documents, vocabulary size) of the vectorized training matrix.
print(X_train.shape)
"""
X_train, y_train = smote.fit_resample(X_train, y_train)
X_train = scaler.fit_transform(X_train)
X_train = feature_selection.fit_transform(X_train, y_train)


X_test = noise_removal.transform(X_test)
X_test = emoticon_convertion.transform(X_test)
X_test = punctuation_removal.transform(X_test)
X_test = tokenization.transform(X_test)
X_test = stopword_removal.transform(X_test)
X_test = lemmatization.transform(X_test)
X_test = vectorization.transform(X_test)
X_test = scaler.transform(X_test)
X_test = feature_selection.transform(X_test, y_train)

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1 Macro Score:", f1_macro)

# Obtain the number of features used by the model
attributes = model.__dict__
#shape = attributes['shape_fit_']
#cw = attributes['class_weight_']
#print(f'Shape: {shape}')
#print(f'class weight: {cw}')

# Print all attributes
#for attr, value in attributes.items():
#    print(attr, ":", value)
"""

Dataset original shape: (15819, 3)
Dataset original class distribution: {1: 0.539, 2: 0.23, 0: 0.149, -1: 0.082}
(12655, 19549)


'\nX_train, y_train = smote.fit_resample(X_train, y_train)\nX_train = scaler.fit_transform(X_train)\nX_train = feature_selection.fit_transform(X_train, y_train)\n\n\nX_test = noise_removal.transform(X_test)\nX_test = emoticon_convertion.transform(X_test)\nX_test = punctuation_removal.transform(X_test)\nX_test = tokenization.transform(X_test)\nX_test = stopword_removal.transform(X_test)\nX_test = lemmatization.transform(X_test)\nX_test = vectorization.transform(X_test)\nX_test = scaler.transform(X_test)\nX_test = feature_selection.transform(X_test, y_train)\n\nmodel = KNeighborsClassifier(n_neighbors=3)\nmodel.fit(X_train, y_train)\ny_pred = model.predict(X_test)\nf1_macro = f1_score(y_test, y_pred, average=\'macro\')\nprint("F1 Macro Score:", f1_macro)\n\n# Obtain the number of features used by the model\nattributes = model.__dict__\n#shape = attributes[\'shape_fit_\']\n#cw = attributes[\'class_weight_\']\n#print(f\'Shape: {shape}\')\n#print(f\'class weight: {cw}\')\n\n# Print all attr

In [None]:
# MODEL COMPARISON
# Environment passed to load_dataset. The results CSV written at the end of
# this cell lives on the Colab Drive mount, so the data is loaded from the
# same environment.
# NOTE(review): to run locally, change ENVIRONMENT and RESULTS_CSV together.
ENVIRONMENT = 'colab'
RESULTS_CSV = '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/Model_Comparison results.csv'

# load_dataset needs both the environment and the sample fraction; the former
# single-argument call raised a TypeError.
df_train, size = load_dataset(ENVIRONMENT, 0.1)
X_train, X_test, y_train, y_test = train_test_split(df_train['message'], df_train['sentiment'], test_size=0.2, random_state=42, stratify=df_train['sentiment'])

# Shared preprocessing; the per-step settings are supplied by param_grid below.
# NOTE(review): SMOTE() is created without random_state, so the oversampling
# is presumably not reproducible between runs — confirm and seed if needed.
preprocessing_steps = [
                       ('noise_removal', NoiseRemover()),
                       ('emoticon_convertion', EmoticonConverter()),
                       ('punctuation_removal', PunctuationRemover()),
                       ('tokenization', Tokenizer()),
                       ('stopword_removal', StopwordRemover()),
                       ('lemmatization', Lemmatizer()),
                       ('vectorization', Vectorize()),
                       ('smote', SMOTE()),
                       ('scaler', Scaler()),
                       ('feature_selection', FeatureSelector())
                       ]

# Candidate classifiers, keyed by the name stored in the results file.
models =        {
                 'Multinomial Naive Bayes': MultinomialNB(),
                 'Logistic Regression': LogisticRegression(max_iter=1000),
                 'KNN': KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', leaf_size=30, metric='minkowski', p=2, weights='distance'),
                 'SVC - linear': SVC(kernel="linear", C=0.025),
                 'SVC - RBF': SVC(gamma=1, C=1),
                 'DecisionTree': DecisionTreeClassifier(max_depth=5),
                 'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
                 'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=200),

                }

# The preprocessing grid and the output layout are the same for every model,
# so they are defined once, outside the loop.
param_grid = {
              'tokenization__type': ['TweetTokenizer'],#'TreebankWordTokenizer'],
              'vectorization__type': ['tfidf','count'],
              'vectorization__max_df': [0.3, 0.5,0.75],
              'vectorization__min_df': [2,10],
              'vectorization__ngram_range': [(1,2),(1,5),(1,7)],
              'vectorization__max_features': [5000,10000],
              'scaler__type': ['robust'],#'minmax','maxabs'],
              'feature_selection__type': ['anova_f','mutualinfo'],
              'feature_selection__percentile': [75,50],
                }
selected_columns = ['Model','Dataset','Timestamp','mean_fit_time','mean_score_time','params','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score','std_test_score','mean_test_score']
# Columns copied straight from GridSearchCV.cv_results_ (everything after the
# three bookkeeping columns Model / Dataset / Timestamp).
metric_columns = selected_columns[3:]

# Create the pipelines

for name, model in models.items():
    pipeline = Pipeline(preprocessing_steps  + [('model', model)])

    # Create GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', error_score='raise', verbose=2)
    grid_search.fit(X_train, y_train)
    df_gridsearch = pd.DataFrame(grid_search.cv_results_)

    # The results file (header row = selected_columns) must already exist:
    # rows are appended without a header.
    new_data = pd.DataFrame(columns=selected_columns)
    new_data[metric_columns] = df_gridsearch[metric_columns]
    # Label the rows with the model of THIS iteration; the former `model_name`
    # was a leftover global from another cell ('MLP'), not the loop variable.
    new_data['Model'] = name
    new_data['Dataset'] = size
    new_data['Timestamp']= datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    new_data.to_csv(RESULTS_CSV, mode='a', header=False, index=False)

In [None]:
'''
# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

model_stats = pipeline.named_steps['model']

# Obtain the number of features used by the model
attributes = model_stats.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)
'''

In [None]:
# Export the most recent grid-search results table to a timestamped CSV.
# Requires df_gridsearch from a grid-search cell that has already been run.
import datetime
# Timestamp in the file name keeps earlier exports from being overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# NOTE(review): this path spells the folder 'MyDrive' while load_dataset's local
# path uses 'My Drive' — confirm which one exists on this machine.
filename = f'G:/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Sprint 6_Advanced Classification/Predict/advanced-classification-predict/notebook/dawieloots_predict_gridsearch_{timestamp}.csv'
df_gridsearch.to_csv(filename, index=False)

In [None]:

# Show the latest grid-search results table and the search configuration.
# Requires df_gridsearch / grid_search from a grid-search cell run earlier.
print(df_gridsearch)
# get_params must be CALLED; printing the bare attribute only showed the
# bound-method representation instead of the parameters.
print(grid_search.get_params())

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Custom feature-selection transformer. Despite its name it does NOT resample:
# oversampling is done by the separate SMOTE step in the pipeline below.

class ResampleAndFeatureSelectTransformer(BaseEstimator, TransformerMixin):
    """Keep the ``k`` best-scoring features using ``SelectKBest``.

    Parameters
    ----------
    k : int, default 1000
        Number of features to keep (forwarded to ``SelectKBest``).
    score_func : callable, default chi2
        Scoring function (forwarded to ``SelectKBest``).
    k_neighbors : int, default 5
        Stored but not used by this class; kept so existing callers that
        pass it continue to work.
    """

    def __init__(self, k=1000, score_func=chi2, k_neighbors=5):
        self.k = k
        self.score_func = score_func
        self.k_neighbors = k_neighbors
        # Built eagerly so ``.feature_selector`` exists before fitting, as before.
        self.feature_selector = self._build_selector()

    def _build_selector(self):
        """Create an unfitted SelectKBest from the CURRENT parameter values."""
        return SelectKBest(score_func=self.score_func, k=self.k)

    def fit(self, X, y):
        """Score the features of ``X`` against ``y``; returns ``self``.

        The selector is rebuilt here because ``set_params`` only updates the
        attributes and never re-runs ``__init__``; without the rebuild, a
        changed ``k`` or ``score_func`` would silently be ignored.
        """
        self.feature_selector = self._build_selector()
        self.feature_selector.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` reduced to the selected features."""
        return self.feature_selector.transform(X)

# Stand-alone demo on a public dataset: TF-IDF -> SMOTE -> top-k feature
# selection -> KNN, independent of the tweet data used above.
# NOTE: this overwrites X_train / X_test / y_train / y_test, preprocessing_steps,
# model and pipeline from the earlier cells.

# Load the 20 Newsgroups dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# Keep only the first 1000 documents to keep the demo fast.
data_subset = {
    'data': data.data[:1000],
    'target': data.target[:1000],
    'target_names': data.target_names
}

# Split the dataset into training and testing sets
# NOTE(review): unlike the splits on the tweet data, this one is not stratified.
X_train, X_test, y_train, y_test = train_test_split(data_subset['data'], data_subset['target'], test_size=0.2, random_state=42)

# Define preprocessing steps
preprocessing_steps = [
    # Text preprocessing
    ('vectorizer', TfidfVectorizer()),  # Convert text data into numerical vectors
    # NOTE(review): SMOTE() has no random_state, so the oversampling is
    # presumably not reproducible between runs — confirm and seed if needed.
    ('smote', SMOTE()),
    ('resample_and_feature_select', ResampleAndFeatureSelectTransformer(k=1000, score_func=chi2)),  # Resample and select top k features
    
]

# Define the model
model = KNeighborsClassifier()

# Create the pipeline
pipeline = Pipeline(preprocessing_steps + [('model', model)])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

# Re-bind `model` to the classifier instance held by the pipeline.
model = pipeline.named_steps['model']

# Obtain the number of features used by the model
# (done by dumping the estimator's instance attributes and reading them manually)
attributes = model.__dict__

# Print all attributes
for attr, value in attributes.items():
    print(attr, ":", value)
#n_features = model.n_features_
#print("Number of features used by the model:", n_features)
