In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import random
from collections import Counter

In [2]:
def plot_coefficients(coefs, feature_names, top_features):
    top_positive_coefficients = np.argsort(coefs)[-top_features:]
    top_negative_coefficients = np.argsort(coefs)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coefs[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coefs[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(0, 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    plt.title("Meilleurs coefficients pour la classification binaire")
    plt.savefig('img/coefs_linear_svm.png')
    plt.show()

In [3]:
def canonizer(data, test_size=0.2, random_state=42, sampling=None, cross_validation=False, cv=5, kernel='rbf', nb_coef=20):
    
    df_results = pd.DataFrame()
    
    if cross_validation == True:
        
        pipe = make_pipeline(StandardScaler(), SVC(kernel=kernel, probability=True))
        
        #cv_results = cross_validate(pipe, data.drop(['canon'], axis=1), data['canon'], cv=cv, scoring='precision')
        cv_results = cross_val_score(pipe, data.drop(['gender'], axis=1), data['gender'], cv=cv, scoring=make_scorer(classification_report_with_accuracy_score))
        return pipe, cv_results
    
    else:
        if kernel == 'rbf':
            pipe = make_pipeline(StandardScaler(), Normalizer(), SVC(kernel=kernel, probability=True))
        else:
            pipe = make_pipeline(StandardScaler(), SVC(kernel=kernel, probability=True))
    

        X_train, X_test, y_train, y_test = train_test_split(data.drop(['gender'], axis=1), data['gender'], test_size=test_size, random_state=random_state)
        print('Original dataset shape {}'.format(Counter(y_train)))
        
    pipe.fit(X_train, y_train)
        
    report = metrics.classification_report(y_test, pipe.predict(X_test), output_dict=True)
    df_scores = pd.DataFrame(report).transpose()
    print(df_scores)

    _ = ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
                   
    df_results['metadata'] = y_test
    df_results['proba male'] = pipe.predict_proba(X_test)[:,0]
    df_results['proba female'] = pipe.predict_proba(X_test)[:,1]
    df_results['prediction']= pipe.predict(X_test)
        
    df_results['accord'] = [True if row['metadata'] == row['prediction'] else False for index, row in df_results.iterrows()]
    
    if kernel == 'linear':
        coefs = pipe.named_steps['svc'].coef_
        plot_coefficients(*coefs, data.columns, nb_coef)
        coef_names = data.columns
        return pipe, df_results, df_scores, coefs, coef_names

    return pipe, df_results, df_scores

In [4]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred))
    return accuracy_score(y_true, y_pred)

In [10]:
df_main = pd.read_csv(r'/home/crazyjeannot/Downloads/features_gendered.csv')
df_main.drop(['Unnamed: 0'], axis = 1)
df_main.set_index("id", inplace = True)
df_main = df_main.replace(np.nan, 0)

In [13]:
df_main = df_main[(df_main.Gender == 'female') | (df_main.Gender == 'Female') | (df_main.Gender == 'male') | (df_main.Gender == 'Male')]
df_main['Gender'] = ['Female' if (elem == 'female') | (elem == 'Female') else 'Male' for elem in df_main['Gender']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main['Gender'] = ['Female' if (elem == 'female') | (elem == 'Female') else 'Male' for elem in df_main['Gender']]


In [14]:
df_main.Gender.value_counts(normalize=True)

Male      0.561667
Female    0.438333
Name: Gender, dtype: float64