In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.model_selection import cross_val_score

from joblib import Memory
from scipy.sparse import hstack
from scipy.sparse import save_npz,load_npz

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Disk-backed memoization store: functions decorated with `memory.cache`
# have their results persisted under ./cachedir (verbose=0 silences joblib's logging).
memory = Memory(location='./cachedir', verbose=0)

## Given:
### [Real or Fake] : Fake Job Description Prediction
This dataset contains about 18K job descriptions, of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. It can be used to build classification models that learn to identify fraudulent job descriptions.

## Goal:

Process the dataset to prepare features for a classification model that predicts whether job descriptions are fraudulent or real.

## Load and explore the data:

In [3]:
@memory.cache
def load_data(path="../data/raw/fake_job_postings.csv"):
    """Read the raw job-postings CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the raw dataset path used by
        this notebook, so ``load_data()`` behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, with the file's first column used as the index.

    Notes
    -----
    The result is memoized on disk through ``memory.cache``.
    NOTE(review): joblib keys the cache on the function's source and its
    arguments, not on the file's contents -- if the CSV is replaced on disk,
    call ``load_data.clear()`` to force a fresh read (confirm against the
    joblib version in use).
    """
    return pd.read_csv(path, index_col=0)

# Load the DataFrame (will use cache if available)
df = load_data()

## Handle Missing Values

- drop high missing value columns
- drop rows depending on the importance of the columns and how much data is missing.
- fill missing values with a placeholder or the mode

### Drop rows/cols

### Impute missing values

Impute missing categorical and text columns with placeholders or mode

## Encode Categorical Variables:

Apply appropriate encoding techniques:
- **Binary columns**: Use direct mapping (0/1).
- **Ordinal columns**: Use label encoding.
- **Nominal columns**: Use one-hot encoding, target encoding, or frequency encoding.


### Define lists for different types of columns

### Encode Ordinal columns using Label Encoding

### Encode Nominal columns

## Select features

### Feature Importance: 

Utilize algorithms that provide feature importance scores (like Random Forest or Gradient Boosting) to identify which features contribute most to the prediction.

```python

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to display features and their importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)
```

## Text Data Cleaning
   - Remove special characters and unnecessary whitespace from text columns.
   - Convert text to lowercase for uniformity.
   - Remove leading/trailing whitespace.
   - Optionally, perform lemmatization or stemming on text data.

## Converting text data into TF-IDF features

There are 3 approaches:

- Separate TF-IDF for Each Feature:

    - Use a separate TfidfVectorizer for each text feature.
    - Fit and transform each feature individually and then concatenate the resulting TF-IDF matrices.

    ```python
    vectorizers = {}
    tfidf_matrices = []

    for feature in text_features:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(df[feature])
        tfidf_matrices.append(tfidf_matrix)
        vectorizers[feature] = vectorizer

    X = hstack(tfidf_matrices)
    ```

- Combined TF-IDF:

    - Concatenate all text features into a single column and apply TfidfVectorizer once.
    - This approach captures interactions between different text features.

    ```python
    combined_text = df['feature1'] + ' ' + df['feature2'] + ' ' + df['feature3']
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(combined_text)
    ```

- Pipeline with ColumnTransformer:

    - Use ColumnTransformer from sklearn.compose to apply TfidfVectorizer to specific text features while leaving other features untouched.
    - This is useful when you have a mixed dataset with text and non-text features.

    ```python
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline

    text_features = ['feature1', 'feature2']
    non_text_features = ['feature3', 'feature4']

    preprocessor = ColumnTransformer(
        transformers=[
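            # TfidfVectorizer expects 1D input, so give each text column its own
            # transformer and select it by scalar name (a list would pass a 2D frame).
            *[(f'text_{col}', TfidfVectorizer(), col) for col in text_features],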
            ('non_text', 'passthrough', non_text_features)
        ]
    )

    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
    X = pipeline.fit_transform(df)
    ```


## Deal with imbalanced dataset

### Resampling Techniques:

**Oversampling**:
Increase the number of instances in the minority class, typically by duplicating existing instances or generating synthetic samples (e.g., using SMOTE).

```python
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
```

**Undersampling**:
Reduce the number of instances in the majority class to balance the dataset, which may involve randomly removing samples.

```python
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
```

**Combination of Over- and Undersampling**:
Use both techniques together to achieve a balance without excessive loss of information.

```python
from imblearn.combine import SMOTEENN

# Assuming X and y are your features and target
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
```

## Save processed data

In [33]:
# Save the sparse matrix
save_npz('../data/processed/X_resampled.npz', X_resampled)

# Save y_resampled as a DataFrame (you can also use Series)
pd.Series(y_resampled).to_csv('../data/processedy_resampled.csv', index=False)

##  Load processed data

In [34]:
# Load the sparse matrix
X_resampled_loaded = load_npz('../data/processed/X_resampled.npz')

# Load the target variable
y_resampled_loaded = pd.read_csv('../data/processedy_resampled.csv').values.flatten()  # Use .values.flatten() if you need it as a 1D array