In [1]:
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf

print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Tensor Flow Version: 2.16.1
Keras Version: 3.3.3

Python 3.9.19 (main, Mar 21 2024, 12:07:41) 
[Clang 14.0.6 ]
Pandas 2.2.2
Scikit-Learn 1.4.2
GPU is available


# READING THE DATA

In [2]:
# import numpy as np
# import pandas as pd
# import sklearn
# import tensorflow as tf
# import matplotlib.pyplot as plt

# # Check for TensorFlow GPU access
# print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# # See TensorFlow version
# print(f"TensorFlow version: {tf.__version__}")

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

In [4]:
df = pd.read_csv('/users/likeshkoya/downloads/train.csv')

# PREPROCESSING

## CHECKING AND REMOVING NULL VALUES

In [5]:
df.isna().sum()

teacher_id                                          0
teacher_prefix                                      3
school_state                                        0
project_submitted_datetime                          0
project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 84325
project_essay_4                                 84325
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
project_is_approved                                 0
total_quantity                                      0
total_price                                         0
dtype: int64

In [6]:
df.drop(['teacher_prefix'], axis=1, inplace=True)

In [7]:
df.fillna('',inplace=True)

## FEATURE ENGINEERING

In [8]:
df["full_eassay"] = df[["project_essay_1","project_essay_2","project_essay_3","project_essay_4","project_resource_summary","project_subject_categories","project_subject_subcategories", 'project_title']].agg(" ".join, axis=1)

In [9]:
df.drop(['project_essay_1', 'project_essay_2','project_essay_3','project_essay_4','project_resource_summary','project_subject_categories', 'project_subject_subcategories', 'project_title'], axis=1, inplace=True)

## FEATURE ENCODING

### removing the unnecessary data

In [10]:
df['full_eassay'] = df['full_eassay'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [11]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):

    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

df['full_eassay'] = df['full_eassay'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/likeshkoya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
import re

def remove_non_alpha(text):

    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

df['text_column_cleaned'] = df['full_eassay'].apply(remove_non_alpha)

In [13]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

wnl = WordNetLemmatizer()
def lemmatize_word(word):
    return wnl.lemmatize(word, pos="v")

df['lemmatized_text'] = df['text_column_cleaned'].apply(lemmatize_word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/likeshkoya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/likeshkoya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
nltk.download('words')

english_words = set(nltk.corpus.words.words())

def extract_dictionary_words(text):
    
    words = nltk.word_tokenize(text)
    dict_words = [word for word in words if word.lower() in english_words]
    return dict_words

df['lemmatized_text'] = df['lemmatized_text'].apply(extract_dictionary_words)

[nltk_data] Downloading package words to
[nltk_data]     /Users/likeshkoya/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
def con(words):
    return ' '.join(words)

df['lemmatized_text'] = df['lemmatized_text'].apply(con)

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(df['lemmatized_text'])

def process_chunk(chunk):
   
    X_chunk = pd.DataFrame(cv.transform(chunk['lemmatized_text']).todense(), columns=sorted(cv.vocabulary_))
    return X_chunk

chunk_size = 1000

chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

result_chunks = [process_chunk(chunk) for chunk in chunks]
final_result = pd.concat(result_chunks, ignore_index=True)

### Reducing the number of columns

In [17]:
column_sum=final_result.sum(axis=0)

In [18]:
def drop_columns_less_than_mean(df):
    
    column_sums = df.sum(axis=0)
    mean_column_sum = column_sums.mean()
    columns_to_drop = column_sums[column_sums < mean_column_sum*0.25].index
    
    df.drop(columns=columns_to_drop, inplace=True)
    return df
final_result=drop_columns_less_than_mean(final_result)

In [19]:
feature_names=final_result.columns

In [20]:
feature_names

Index(['ability', 'able', 'absent', 'absolute', 'absolutely', 'absorb',
       'abstract', 'abundance', 'abuse', 'academic',
       ...
       'yoga', 'york', 'young', 'younger', 'youth', 'zero', 'zest', 'zip',
       'zone', 'zoo'],
      dtype='object', length=3598)

In [21]:
df = pd.concat([df, final_result], axis = 1).drop(columns = ['lemmatized_text'])

### Encoding

In [22]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
ohetransform = ohe.fit_transform(df[['school_state']])

In [23]:
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
ohetransformed = ohe.fit_transform(df[['project_grade_category']])

In [24]:
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
ohetransform = ohe.fit_transform(df[['school_state']])

In [25]:
def hex_to_decimal(hex):
    return int(hex, 16)

df['teacher_id_no'] = df['teacher_id'].apply(hex_to_decimal)

In [26]:
df.drop(['teacher_id'], axis=1, inplace=True)

### Concantination

In [27]:
df = pd.concat([df, ohetransform ], axis = 1).drop(columns = ['school_state'])

In [28]:
df = pd.concat([df, ohetransformed ], axis = 1).drop(columns = ['project_grade_category'])

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87398 entries, 0 to 87397
Columns: 3661 entries, project_submitted_datetime to project_grade_category_Grades PreK-2
dtypes: float64(56), int64(3601), object(4)
memory usage: 2.4+ GB


In [30]:
df.isna().sum().sum()

0

In [31]:
df.head()

Unnamed: 0,project_submitted_datetime,teacher_number_of_previously_posted_projects,project_is_approved,total_quantity,total_price,full_eassay,text_column_cleaned,ability,able,absent,...,school_state_VA,school_state_VT,school_state_WA,school_state_WI,school_state_WV,school_state_WY,project_grade_category_Grades 3-5,project_grade_category_Grades 6-8,project_grade_category_Grades 9-12,project_grade_category_Grades PreK-2
0,2016-06-20 23:05:15,0,1,1,264.99,"\ '' teacher next year , made science fun ? \ ...",teacher next year made science fun asked...,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2016-09-01 08:11:20,12,1,5,847.0,living poverty often means lack access things ...,living poverty often means lack access things ...,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016-05-09 11:40:49,1,1,13,222.2,high school speech debate team . qualified alt...,high school speech debate team qualified alte...,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2016-11-27 18:15:15,0,1,5,318.97,students fun bunch diverse kids love school en...,students fun bunch diverse kids love school en...,0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2016-07-07 19:20:49,0,0,25,606.41,students working hard increase reading levels ...,students working hard increase reading levels ...,1,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [32]:
df.drop(['project_submitted_datetime','full_eassay','text_column_cleaned'], axis=1, inplace=True)

In [33]:
# df.to_csv('like.csv', index=False)

# TRAINING

In [34]:
Y = df['project_is_approved']

In [35]:
X = df.drop('project_is_approved', axis = 1)

In [36]:
all_train_columns_names=X.columns

In [37]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [None]:

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import precision_score, recall_score

# X_train, X_test, y_train, y_test = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)


# n_estimators = 100 
# learning_rate = 1.0  

# adaboost_clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

# adaboost_clf.fit(X_train, y_train)

# y_pred = adaboost_clf.predict(X_test)

# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)

# print("Precision:", precision)
# print("Recall:", recall)

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming X and Y are defined somewhere before this point
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)

# Define the parameter grid to search
param_dist = {
    'n_estimators': randint(10, 200),  # Number of trees in the forest
    'max_depth': [None] + list(np.arange(3, 21)),  # Maximum depth of the tree
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'min_samples_split': randint(2, 20),  # Minimum number of samples required to split a node
    'min_samples_leaf': randint(1, 20),  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Create a Random Forest classifier instance
rf = RandomForestClassifier()

# Randomized search cross validation
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Best parameters found
print("Best Parameters:", random_search.best_params_)

# Best cross-validation score found
print("Best Score:", random_search.best_score_)

# Get the best estimator
best_rf = random_search.best_estimator_

# Now you can use 'best_rf' for predictions or further analysis


Fitting 5 folds for each of 100 candidates, totalling 500 fits




# Testing

In [None]:
df_test = pd.read_csv('/users/likeshkoya/downloads/test.csv')

In [None]:
backup = df_test

# PREPROCESSING

## CHECKING AND REMOVING NULL VALUES

In [None]:
df_test.isna().sum()

In [None]:
df_test.drop(['teacher_prefix'], axis=1, inplace=True)

In [None]:
df_test.fillna('',inplace=True)

In [None]:
df_test.info()

## FEATURE ENGINEERING

In [None]:
df_test["full_eassay"] = df_test[["project_essay_1","project_essay_2","project_essay_3","project_essay_4","project_resource_summary","project_subject_categories","project_subject_subcategories", 'project_title']].agg(" ".join, axis=1)

In [None]:
df_test.drop(['project_essay_1', 'project_essay_2','project_essay_3','project_essay_4','project_resource_summary','project_subject_categories', 'project_subject_subcategories', 'project_title'], axis=1, inplace=True)

## FEATURE ENCODING

### removing the unnecessary data

In [None]:
df_test['full_eassay'] = df_test['full_eassay'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
df_test.info()

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):

    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

df_test['full_eassay'] = df_test['full_eassay'].apply(remove_stopwords)

In [None]:
df_test.info()

In [None]:
import re

def remove_non_alpha(text):

    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

df_test['text_column_cleaned'] = df_test['full_eassay'].apply(remove_non_alpha)

In [None]:
df_test.info()

In [None]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
def lemmatize_word(word):
    return wnl.lemmatize(word, pos="v")

df_test['lemmatized_text'] = df_test['text_column_cleaned'].apply(lemmatize_word)

In [None]:
df_test.info()

In [None]:
nltk.download('words')

english_words = set(nltk.corpus.words.words())

def extract_dictionary_words(text):
    
    words = nltk.word_tokenize(text)
    dict_words = [word for word in words if word.lower() in english_words]
    return dict_words

df_test['lemmatized_text'] = df_test['lemmatized_text'].apply(extract_dictionary_words)

In [None]:
df_test.info()

In [None]:
def con(words):
    return ' '.join(words)

df_test['lemmatized_text'] = df_test['lemmatized_text'].apply(con)

In [None]:
df_test.info()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(df_test['lemmatized_text'])

def process_chunk(chunk):
   
    X_chunk = pd.DataFrame(cv.transform(chunk['lemmatized_text']).todense(), columns=sorted(cv.vocabulary_))
    return X_chunk

chunk_size = 1000

chunks = [df_test[i:i+chunk_size] for i in range(0, len(df_test), chunk_size)]

result_chunks = [process_chunk(chunk) for chunk in chunks]
final_result = pd.concat(result_chunks, ignore_index=True)

In [None]:
df_test.info()

### Reducing the number of columns

In [None]:
type(feature_names)

In [None]:
df_test.info()

In [None]:
feature_names

In [None]:
df_test.info()

In [None]:
def drop_columns_not_in_train_data(df):
    
    columns_not_to_drop = feature_names
    
    to_drop_df = df.drop(columns=columns_not_to_drop)
    columns_to_drop = to_drop_df.columns
    df.drop(columns=columns_to_drop, inplace = True)
    return df
final_result=drop_columns_not_in_train_data(final_result)

In [None]:
df_test.info()

In [None]:
final_result.columns

In [None]:
feature_names

In [None]:
df_test = pd.concat([df_test, final_result], axis = 1).drop(columns = ['lemmatized_text'])

In [None]:
df_test.info()

### Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
ohetransform_test_schoolstate = ohe.fit_transform(df_test[['school_state']])

In [None]:
df_test.info()

In [None]:
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
ohetransform_test_grade = ohe.fit_transform(df_test[['project_grade_category']])

In [None]:
 df_test.info()

In [None]:
def hex_to_decimal(hex):
    return int(hex, 16)

df_test['teacher_id_no'] = df_test['teacher_id'].apply(hex_to_decimal)

In [None]:
df_test.info()

In [None]:
df_test.drop(['teacher_id'], axis=1, inplace=True)

In [None]:
df_test.info()

### Concantination

In [None]:
df_test = pd.concat([df_test, ohetransform_test_schoolstate ], axis = 1).drop(columns = ['school_state'])

In [None]:
df_test.info()

In [None]:
df_test = pd.concat([df_test, ohetransform_test_grade ], axis = 1).drop(columns = ['project_grade_category'])

In [None]:
df_test.info()

In [None]:
df_test.isna().sum().sum()

In [None]:
df_test.head()

In [None]:
df_test.drop(['project_submitted_datetime','full_eassay','text_column_cleaned'], axis=1, inplace=True)

In [None]:
def drop_columns_not_in_train_data(df):
    
    columns_not_to_drop = all_train_columns_names
    
    to_drop_df = df.drop(columns=columns_not_to_drop)
    columns_to_drop = to_drop_df.columns
    df.drop(columns=columns_to_drop, inplace = True)
    return df
df_test=drop_columns_not_in_train_data(df_test)

In [None]:
all_train_columns_names

In [None]:
df_test.columns

In [None]:
df_test.info()

In [None]:
df_test.fillna(0,inplace=True)

# Predicting 

In [None]:
y_prediction = best_rf.predict(df_test)

In [None]:
guhdjsahjk = pd.DataFrame(y_prediction)

In [None]:
backup=pd.concat([backup, guhdjsahjk ], axis=1)

In [None]:
guhdjsahjk.describe()

In [None]:
guhdjsahjk.sum()

In [None]:
guhdjsahjk

In [None]:
import pandas as pd

zeros_count = guhdjsahjk.iloc[:, 0].value_counts().get(0, 0)
ones_count = guhdjsahjk.iloc[:, 0].value_counts().get(1, 0)

print("Number of zeros in the first column:", zeros_count)
print("Number of ones in the first column:", ones_count)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [None]:
# predictions_df = pd.DataFrame({'predicted_label': y_prediction})

In [None]:
# predictions_df.to_csv('predict_se21uari210.csv', index=False)