# Indian Patent Dataset Predictive Modeling
This notebook provides predictive modeling of the Indian Patent Dataset for the years 2010, 2011, and 2019. The purpose of this notebook is to build and evaluate predictive models to forecast future trends or classify patents.

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix
sns.set(style="whitegrid")

df_2010 = pd.read_csv('data/2010.csv')
df_2011 = pd.read_csv('data/2011.csv')
df_2019 = pd.read_csv('data/2019.csv')

df_2010['Year'] = 2010
df_2011['Year'] = 2011
df_2019['Year'] = 2019
df_all = pd.concat([df_2010, df_2011, df_2019])

df_all['Application Date'] = pd.to_datetime(df_all['Application Date'], errors='coerce')
df_all['Filing Month'] = df_all['Application Date'].dt.month
df_all['Filing Year'] = df_all['Application Date'].dt.year

## Data Preprocessing

In [25]:
df_model = df_all[['Filing Month', 'Filing Year', 'Field Of Invention', 'Status']]

df_model['Field Of Invention'] = df_model['Field Of Invention'].astype('category').cat.codes
df_model['Status'] = df_model['Status'].astype('category').cat.codes

df_model.dropna(inplace=True)

X = df_model.drop('Status', axis=1)
y = df_model['Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['Field Of Invention'] = df_model['Field Of Invention'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['Status'] = df_model['Status'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model.dropna(inplace=True)


## Different Machine Learning Models

In [27]:
# Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f'Linear Regression MSE: {mse_lr}')

Linear Regression MSE: 0.0


In [26]:
# Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {accuracy_dt}')
print(f'Classification Report: {classification_report(y_test, y_pred_dt)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred_dt)}')

Decision Tree Accuracy: 1.0
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9539

    accuracy                           1.00      9539
   macro avg       1.00      1.00      1.00      9539
weighted avg       1.00      1.00      1.00      9539

Confusion Matrix: [[9539]]


In [28]:
# Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')
print(f'Classification Report: {classification_report(y_test, y_pred_rf)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred_rf)}')

Random Forest Accuracy: 1.0
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9539

    accuracy                           1.00      9539
   macro avg       1.00      1.00      1.00      9539
weighted avg       1.00      1.00      1.00      9539

Confusion Matrix: [[9539]]


## Hyperparameter Tuning and Model Selection

In [24]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f'Best Random Forest Accuracy: {accuracy_best_rf}')
print(f'Classification Report: {classification_report(y_test, y_pred_best_rf)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred_best_rf)}')

Fitting 5 folds for each of 54 candidates, totalling 270 fits


90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
63 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Debjit Mandal\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Debjit Mandal\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\Debjit Mandal\AppData\Local\Packages\PythonSoftwareFoundation.Pyth

Best Random Forest Accuracy: 1.0
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9539

    accuracy                           1.00      9539
   macro avg       1.00      1.00      1.00      9539
weighted avg       1.00      1.00      1.00      9539

Confusion Matrix: [[9539]]
