# In [0]:
# %% [code]
### This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory, echoing every file found so the notebook
# output records which inputs were available.
input_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)
        input_files.append(file_path)

# Any results you write to the current directory are saved as output.
# Select the dataset explicitly instead of relying on the loop variables
# that leak out of the walk: those are undefined when the directory is empty
# and may point at a non-CSV file when several inputs are mounted.
csv_files = [path for path in input_files if path.lower().endswith('.csv')]
if not csv_files:
    raise FileNotFoundError("No CSV file found under '/kaggle/input'")
data_path = csv_files[-1]

# The first CSV column is used as the row index.
dataset = pd.read_csv(data_path, index_col=0)


# %% [code]
# Column groups used throughout the notebook.

# Every raw input column considered as a potential feature.
features = [
    'title',
    'location',
    'department',
    'salary_range',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]

# Flag columns.
bool_features = ['telecommuting', 'has_company_logo', 'has_questions']

# Free-text columns.
text_features = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits',
]

# Categorical columns and the names of their label-encoded counterparts.
cat_features = [
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]
cat_features_encode = [f'{name}_encode' for name in cat_features]

# Additional columns derived from the salary range, plus the department.
cat_extra_features = ['salary_min', 'salary_max', 'department']


# %% [code]
# Quick look at the raw data.
print(dataset)

# %% [code]
# NOTE: this cell used to run `del dataset`. Every following cell reads
# `dataset`, so deleting it here made the rest of the script fail with a
# NameError; the frame is intentionally kept alive.


# %% [code]
# Missing values
print(dataset.isnull().sum())

# Categorical gaps become an explicit 'Unknown' level; missing text becomes
# an empty string.
dataset[cat_features] = dataset[cat_features].fillna('Unknown')
dataset[text_features] = dataset[text_features].fillna('')

# fillna returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the NaN values in place.
dataset['department'] = dataset['department'].fillna('Unknown')
defecto = -1
salarios = dataset['salary_range']
print(dataset)
print(salarios)

# Salary ranges look like "min-max", but some of them hold unsuitable values
# such as Apr, Dec, etc. The range is split into two columns so both ends can
# be analysed as numbers and the non-numeric contents removed afterwards.
# reindex guarantees that both columns exist even if no value contains "-".
salarios = (
    dataset['salary_range']
    .str.split("-", n=1, expand=True)
    .reindex(columns=[0, 1])
)
dataset.drop('salary_range', axis=1, inplace=True)
dataset['salary_min'] = salarios[0]
dataset['salary_max'] = salarios[1]

# Fill the empty salary bounds with -1. Assigning the result back avoids
# `inplace=True` on a column selection, which is chained assignment and is
# not guaranteed to update the parent frame.
dataset['salary_min'] = dataset['salary_min'].fillna(defecto)
dataset['salary_max'] = dataset['salary_max'].fillna(defecto)

print(dataset.isnull().sum())
print(dataset)
print(dataset.groupby('salary_min').sum())
print(dataset.groupby('salary_max').sum())

# %% [code]
# Replace the non-numeric salary values (month abbreviations) with -1.
meses = ['Dec', 'Jun', 'Oct', 'Sep', 'Nov', 'Apr']

# A single boolean-mask assignment through .loc per column. The previous
# row-by-row `dataset[col][row] = -1` form is chained indexing, which may
# write to a temporary copy, and it looped in Python over every row and month.
for salary_col in ('salary_min', 'salary_max'):
    dataset.loc[dataset[salary_col].isin(meses), salary_col] = -1

print(dataset.groupby('salary_min').sum())
print(dataset.groupby('salary_max').sum())

# %% [code]



# Bar chart of the target column with the percentage of each class on top.
fig = plt.figure(figsize=(5, 5))
ax = sns.countplot(x=dataset.fraudulent, data=dataset, palette='Set1')
plt.title('fraudulent distribution', fontsize=15)

# Share of each class, in percent of the non-null target values.
class_counts = dataset.fraudulent.value_counts()
n_labelled = dataset.fraudulent.count()
yes = (class_counts[1] / n_labelled) * 100
no = (class_counts[0] / n_labelled) * 100

rects = ax.patches

# Labels are paired with the bars positionally: first `no`, then `yes`.
labels = [no, yes]

for bar, share in zip(rects, labels):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 5,
        f'{share:1.2f}%',
        ha='center',
        va='bottom',
        fontsize=12,
    )



# %% [code]
plt.figure(figsize=(20, 20))

# One count plot per boolean column, placed on a 3x3 grid and split by the
# `fraudulent` flag.
grid = gridspec.GridSpec(3, 3, wspace=0.5, hspace=0.5)
total = len(dataset)

for i, col in enumerate(dataset[bool_features].columns):
    fig = plt.subplot(grid[i])  # `fig` is the Axes returned by plt.subplot
    sns.countplot(x=col, data=dataset, hue='fraudulent', palette='Accent')
    fig.set_title(f'{col} distribution')
    fig.set_xlabel(f'{col} values')
    fig.set_ylabel('Count')
    plt.legend()

    # Write each bar's share of all rows just above the bar.
    sizes = [p.get_height() for p in fig.patches]
    for p, height in zip(fig.patches, sizes):
        fig.text(
            p.get_x() + p.get_width() / 2.,
            height + 3,
            f'{height / total * 100:1.2f}%',
            ha="center",
        )
    # Leave headroom above the tallest bar for its label.
    fig.set_ylim(0, max(sizes) * 1.15)
plt.show()


# %% [code]
plt.figure(figsize=(70, 400))
# NOTE(review): this GridSpec is assigned but the subplots below are created
# with plt.subplot(5, 1, ...) and never use it.
grid = gridspec.GridSpec(3, 3, wspace=1, hspace=1)

# Only these columns get a percentage label on every bar.
annotated_cols = ('employment_type', 'required_experience', 'required_education')
total = len(dataset)

for i, col in enumerate(dataset[cat_features].columns):
    fig = plt.subplot(5, 1, i + 1, autoscale_on=True)  # one row per column
    # NOTE(review): the loop index is passed as the grid flag, so it is 0 for
    # the first column and non-zero afterwards - confirm this is intended.
    fig.grid(i)
    sns.countplot(x=col, data=dataset, hue='fraudulent', palette='Set1')
    fig.set_title(f'{col} distribution', fontsize=40)
    fig.set_xlabel(f'{col} values', fontsize=40)
    fig.set_ylabel('Count', fontsize=40)
    plt.legend(fontsize=40)
    fig.tick_params(labelsize=35, labelrotation=90)

    sizes = []
    if col in annotated_cols:
        # Write each bar's share of all rows just above the bar.
        for p in fig.patches:
            height = p.get_height()
            sizes.append(height)
            fig.text(
                p.get_x() + p.get_width() / 2.,
                height + 3,
                f'{height / total * 100:1.2f}%',
                ha="center",
                fontsize=40,
            )
        # Leave headroom above the tallest bar for its label.
        fig.set_ylim(0, max(sizes) * 1.15)

plt.show()
#print(dataset.groupby('function').sum())
#print(dataset.groupby('salary_max').sum())
#print(dataset.groupby('has_questions').sum())
#print(dataset.groupby('telecommuting').sum())
#sns.barplot(x=x_train.telecommuting, y = y_train)
#sns.barplot(x = x_train.has_questions, y = y_train)
#print(dataset.groupby('required_education').sum())
#print(dataset.groupby('has_company_logo').sum())

#sns.heatmap(data = dataset)


# %% [code]
# Analyse the number of words in each text column and its possible relation
# to fraudulent offers: one pair of histograms (real vs. fake) per column.
for col in text_features:
    fig, (ax, ay) = plt.subplots(ncols=2, figsize=(10, 4), dpi=100)
    fig.tight_layout(pad=4)

    # Words per posting; missing text counts as zero words.
    word_counts = dataset[col].fillna('').str.split().str.len()
    # fraudulent == 0 is a real offer and fraudulent == 1 a fake one. The
    # two selections used to be swapped relative to the plot titles.
    palabras = word_counts[dataset.fraudulent == 0]
    palabras2 = word_counts[dataset.fraudulent == 1]

    panels = (
        (ax, palabras, 'real', 'teal'),
        (ay, palabras2, 'fake', 'olive'),
    )
    for axis, counts, kind, color in panels:
        axis.hist(x=counts, bins=20, color=color)
        axis.set_title(f'{col} number of words in {kind} offers', fontsize=10)
        axis.set_xlabel('number of words', fontsize=10)
        # Each bar counts offers, so both panels share the same y label.
        axis.set_ylabel('number of offers with "x" words', fontsize=10)
        axis.tick_params(labelsize=10)

# %% [code]
# Label-encode every categorical column into a parallel `<name>_encode`
# column. The same encoder object is refitted for each column, so after this
# cell `cat_encoder` only holds the classes of the last column it was fitted on.
cat_encoder = LabelEncoder()

for col in cat_features:
    dataset[col + '_encode'] = cat_encoder.fit_transform(dataset[col])

# `department` can still contain NaN at this point; map it to an explicit
# 'Unknown' level so the encoder only sees strings.
dataset['department'] = dataset['department'].fillna('Unknown')
dataset['department_encode'] = cat_encoder.fit_transform(dataset['department'])
print(dataset.groupby('department_encode').sum())

# %% [code]


# Concatenate every free-text column into a single `text` field. This has to
# run before the split below: the split frames are derived from `dataset`
# and read its `text` column, which previously did not exist yet (and was
# later built with `+=` on a column that had never been initialised).
dataset['text'] = (
    dataset[text_features].fillna('').astype(str).agg(' '.join, axis=1)
)

# %% [code]
seed_state = 315    # seed for every train/validation/test split
random_state = 42   # seed for sampling the real postings

real_dataset = dataset[dataset['fraudulent'] == 0].copy()
fake_dataset = dataset[dataset['fraudulent'] == 1].copy()

# Keep at most 3000 real postings and every fraudulent one.
real_sampled_dataset = real_dataset.sample(
    n=min(3000, len(real_dataset)), random_state=random_state
)
final_dataset = pd.concat([real_sampled_dataset, fake_dataset], axis=0)

real_dataset = final_dataset[final_dataset['fraudulent'] == 0]
fake_dataset = final_dataset[final_dataset['fraudulent'] == 1]

y_real = real_dataset['fraudulent'].copy()
x_real = real_dataset.drop(['fraudulent'], axis=1)

y_fake = fake_dataset['fraudulent'].copy()
x_fake = fake_dataset.drop(['fraudulent'], axis=1)

x = pd.concat([x_real, x_fake])
y = pd.concat([y_real, y_fake])

# Each class is split separately so that train, validation and test all
# receive the same proportion of it: 30% test, then 20% of the remainder
# for validation.
x_real_tv, x_real_test, y_real_tv, y_real_test = train_test_split(
    x_real, y_real, test_size=0.3, random_state=seed_state)
x_real_train, x_real_val, y_real_train, y_real_val = train_test_split(
    x_real_tv, y_real_tv, test_size=0.2, random_state=seed_state)

x_fake_tv, x_fake_test, y_fake_tv, y_fake_test = train_test_split(
    x_fake, y_fake, test_size=0.3, random_state=seed_state)
x_fake_train, x_fake_val, y_fake_train, y_fake_val = train_test_split(
    x_fake_tv, y_fake_tv, test_size=0.2, random_state=seed_state)

x_train = pd.concat([x_real_train, x_fake_train])
y_train = pd.concat([y_real_train, y_fake_train])

x_val = pd.concat([x_real_val, x_fake_val])
y_val = pd.concat([y_real_val, y_fake_val])

x_test = pd.concat([x_real_test, x_fake_test])
y_test = pd.concat([y_real_test, y_fake_test])

# Per-model views of the three splits.
x_train_text = x_train['text'].copy()
x_val_text = x_val['text'].copy()
x_test_text = x_test['text'].copy()

rf_features = cat_features_encode + bool_features

# The validation and test views must come from their own split; they were
# previously both copied from x_train.
x_train_rf = x_train[rf_features].copy()
x_val_rf = x_val[rf_features].copy()
x_test_rf = x_test[rf_features].copy()

x_train_cat = x_train[cat_features_encode].copy()
x_val_cat = x_val[cat_features_encode].copy()
x_test_cat = x_test[cat_features_encode].copy()

x_train_bool = x_train[bool_features].copy()
x_val_bool = x_val[bool_features].copy()
x_test_bool = x_test[bool_features].copy()

# %% [code]


# %% [code]
# TF-IDF representation of the text data (unigrams to trigrams).
# max_df is given as the float 1.0 ("no upper limit"): the integer 1 means
# "appears in at most one document" and discards every term shared between
# postings. min_df=1 keeps every term and is accepted by all scikit-learn
# versions, unlike the integer 0.
vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1, 3))

# Fit once on the training text only. The previous version repeated the same
# fit inside a loop over every dataset row and expanded the sparse matrix
# into a dense DataFrame each time, which nothing used afterwards.
train_vectors = vectorizer.fit_transform(x_train_text)
# get_feature_names() was removed in newer scikit-learn releases; fall back
# to it only when get_feature_names_out() is not available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# The test text is only transformed, reusing the training vocabulary.
test_vectors = vectorizer.transform(x_test_text)

print('Tfidf_train:', train_vectors.shape)
print('Tfidf_test:', test_vectors.shape)

# %% [code]
def matriz(y_valid, predict, title):
    """Plot a confusion matrix as an annotated heatmap.

    Diagonal cells show the row percentage plus "count/row total",
    off-diagonal cells show the row percentage and count, and empty
    off-diagonal cells are left blank.

    Args:
        y_valid: True labels; its unique values define the matrix classes.
        predict: NumPy array of predicted labels; it is cast to int.
        title: Title of the figure.
    """
    class_labels = np.unique(y_valid)
    cm = confusion_matrix(y_valid, predict.astype(int), labels=class_labels)
    suma = np.sum(cm, axis=1, keepdims=True)
    perc = cm / suma.astype(float) * 100

    # Build the annotations as plain Python strings so that no fixed-width
    # NumPy string dtype can truncate them.
    n_rows, n_cols = cm.shape
    annot_rows = []
    for row in range(n_rows):
        cells = []
        for col in range(n_cols):
            count = cm[row, col]
            pct = perc[row, col]
            if row == col:
                # Index the scalar explicitly: suma[row] is a 1-element array.
                cells.append('%.1f%%\n%d/%d' % (pct, count, suma[row, 0]))
            elif count == 0:
                cells.append('')
            else:
                cells.append('%.1f%%\n%d' % (pct, count))
        annot_rows.append(cells)
    annot = np.array(annot_rows)

    cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    cm.index.name = 'Real values'
    cm.columns.name = 'Predicted values'
    fig, ax = plt.subplots(figsize=(14, 14))
    plt.title(title)
    sns.heatmap(cm, cmap="YlGnBu", annot=annot, fmt='', ax=ax)


# %% [code]
# Multinomial naive Bayes on the TF-IDF text vectors. `matriz` is defined
# above this cell because it is called here.
mnb = MultinomialNB()
tfidf = mnb.fit(train_vectors, y_train)
print(tfidf)
mnb_predict = mnb.predict(test_vectors)
# Keep the score: it used to be computed and discarded, and the print below
# then failed on the undefined name.
mnb_score = f1_score(y_test, mnb_predict, average='weighted')
print('score: ', mnb_score)

matriz(y_test, mnb_predict, 'Confusion Matrix: Text MultinomialNB')

# %% [code]

# Random forest trained on the label-encoded categorical columns only.
rf_cat = RandomForestClassifier()

# Show the exact frame the model is trained on.
print(x_train_cat)
print(y_train)

rf_cat.fit(x_train_cat, y_train)
predicted_val_cat = rf_cat.predict(x_val_cat)
predicted_test_cat = rf_cat.predict(x_test_cat)

# The F1 scores used to be computed and silently discarded; keep and report them.
f1_val_cat = f1_score(y_val, predicted_val_cat, average='macro')
print('categorical RF validation macro F1:', f1_val_cat)
matriz(y_val, predicted_val_cat, 'Confusion Matrix: categorical Random Forest Validation')

f1_test_cat = f1_score(y_test, predicted_test_cat, average='macro')
print('categorical RF test macro F1:', f1_test_cat)
matriz(y_test, predicted_test_cat, 'Confusion Matrix: categorical Random Forest')



# %% [code]
# Random forest trained on the boolean flag columns only.
rf_bool = RandomForestClassifier()

# Show the exact frame the model is trained on.
print(x_train_bool)
print(y_train)

rf_bool.fit(x_train_bool, y_train)
predicted_val_bool = rf_bool.predict(x_val_bool)
predicted_test_bool = rf_bool.predict(x_test_bool)

# The F1 scores used to be computed and silently discarded; keep and report them.
f1_val_bool = f1_score(y_val, predicted_val_bool, average='macro')
print('boolean RF validation macro F1:', f1_val_bool)
matriz(y_val, predicted_val_bool, 'Confusion Matrix: boolean Random Forest Validation')

f1_test_bool = f1_score(y_test, predicted_test_bool, average='macro')
print('boolean RF test macro F1:', f1_test_bool)
matriz(y_test, predicted_test_bool, 'Confusion Matrix: boolean Random Forest Test')

# %% [code]
# Random forest trained on the salary bounds and the encoded department.
rf_sal = RandomForestClassifier()

sal_features = ['salary_min', 'salary_max', 'department_encode']


def _numeric_salary_features(frame):
    """Return the salary feature columns of *frame* as numbers.

    The salary bounds come from a string split, so they hold text mixed with
    the -1 placeholder. Anything that does not parse as a number becomes -1.
    """
    return frame[sal_features].apply(pd.to_numeric, errors='coerce').fillna(-1)


x_train_sal = _numeric_salary_features(x_train)
x_val_sal = _numeric_salary_features(x_val)
x_test_sal = _numeric_salary_features(x_test)

rf_sal.fit(x_train_sal, y_train)

predicted_val_sal = rf_sal.predict(x_val_sal)
predicted_test_sal = rf_sal.predict(x_test_sal)

# The F1 scores used to be computed and silently discarded; keep and report
# them. The plot titles now name this model instead of the categorical one.
f1_val_sal = f1_score(y_val, predicted_val_sal, average='macro')
print('salary RF validation macro F1:', f1_val_sal)
matriz(y_val, predicted_val_sal, 'Confusion Matrix: salary Random Forest validation')

f1_test_sal = f1_score(y_test, predicted_test_sal, average='macro')
print('salary RF test macro F1:', f1_test_sal)
matriz(y_test, predicted_test_sal, 'Confusion Matrix: salary Random Forest test')

# %% [code]
# Collect the predictions of the three random forests as inputs for the
# second-stage model. Both frames use the same column names in the same
# order: scikit-learn checks feature names between fit and predict, and the
# previous `_val` / `_test` suffixes made the two frames disagree.
agg_val = pd.DataFrame({
    'cat_predictions': predicted_val_cat,
    'bool_predictions': predicted_val_bool,
    'sal_predictions': predicted_val_sal,
})
agg_test = pd.DataFrame({
    'cat_predictions': predicted_test_cat,
    'bool_predictions': predicted_test_bool,
    'sal_predictions': predicted_test_sal,
})

print(agg_val)
print(agg_test)

# %% [code]
# Second-stage model: logistic regression fitted on the validation-set
# predictions of the base models and evaluated on their test-set predictions.
lr = LogisticRegression(C=0.1, solver='lbfgs', max_iter=2000, verbose=0, n_jobs=-1)
lr.fit(agg_val, y_val)

predictions = lr.predict(agg_test)

# The final F1 score used to be computed and discarded; keep and report it.
f1_aggregate = f1_score(y_test, predictions, average='macro')
print('aggregate model test macro F1:', f1_aggregate)
matriz(y_test, predictions, 'Confusion Matrix: Aggregate Model Final Predictions ')

# %% [code]


# CSV variables that seem most related to fraudulent offers:
# required_education = high school, has_company_logo = 0.
plt.figure(figsize=(20, 5))

# K-nearest-neighbours classifier (scikit-learn's default n_neighbors is 5).
# It is only instantiated here; the fit/predict steps are still disabled:
#   model.fit(x_train, y_train)
#   x_test = x_train
#   predicted = model.predict(x_test)
model = KNeighborsClassifier(n_neighbors=6)

# %% [code]
