In [1]:
# Import dependencies
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Read in data
df = pd.read_csv("csv_output/claims_summary_uspc_df.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df.dropna(how='all')
df.head()

Unnamed: 0.1,Unnamed: 0,patent_number,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
0,0,RE30349,220,6,3782,23649,630,3942
1,1,RE30153,144,14,3194,19884,228,1420
2,2,RE30744,365,10,2679,18543,268,1854
3,3,RE29796,194,6,1114,7151,186,1192
4,4,RE30870,137,9,1762,11244,196,1249


In [3]:
df.count()

Unnamed: 0                   14945
patent_number                14945
uspc_class                   14945
claim_no                     14945
total_word_ct                14945
total_char_ct                14945
average_word_ct_eachclaim    14945
average_char_ct_eachclaim    14945
dtype: int64

In [4]:
df.dtypes

Unnamed: 0                    int64
patent_number                object
uspc_class                   object
claim_no                      int64
total_word_ct                 int64
total_char_ct                 int64
average_word_ct_eachclaim     int64
average_char_ct_eachclaim     int64
dtype: object

In [5]:
print(df["uspc_class"].unique())

['220' '144' '365' '194' '137' '000' '435' '156' '340' '423' '425' '219'
 '428' '062' '131' '222' '307' '074' '128' '176' '148' '356' '123' '209'
 '270' '364' '114' '271' '361' '051' '260' '073' '008' '313' '525' '249'
 '166' '055' '214' '033' '264' '426' '030' '046' '132' '101' '060' '324'
 '315' '029' '072' '503' '250' '277' '999' '215' '358' '241' '061' '430'
 '273' '424' '075' '164' '210' '357' '119' '429' '604' '536' '303' '023'
 '285' '206' '198' '354' '474' '057' '070' '047' '208' '562' '310' '564'
 '523' '367' '254' '104' '418' '227' '308' '252' '200' '052' '228' '162'
 '290' '174' '028' '106' '325' '544' '528' '040' '414' '239' '244' '192'
 '034' '427' '017' '152' '343' '560' '014' '318' '083' '294' '177' '280'
 '112' '542' '011' '350' '053' '378' '339' 'PLT' '108' '016' '160' '175'
 '187' '346' '135' '272' '096' '233' '056' '422' '433' '024' '410' '415'
 '013' '297' '382' '403' '032' '091' '204' '355' '236' '071' '179' '134'
 '099' '085' '140' '065' '362' '526' '180' '455' '3

In [6]:
df = df[['uspc_class', 'claim_no', 'total_word_ct', 'total_char_ct', 'average_word_ct_eachclaim', 'average_char_ct_eachclaim']]

In [7]:
display(df)

Unnamed: 0,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
0,220,6,3782,23649,630,3942
1,144,14,3194,19884,228,1420
2,365,10,2679,18543,268,1854
3,194,6,1114,7151,186,1192
4,137,9,1762,11244,196,1249
...,...,...,...,...,...,...
14940,D15,1,12,74,12,74
14941,D15,1,12,74,12,74
14942,D18,1,11,66,11,66
14943,D09,1,10,59,10,59


In [8]:
# All electrical communications uspc_class
df_limited_uspc = df.loc[(df["uspc_class"] == "370") | (df["uspc_class"] == "709") | (df["uspc_class"] == "345"), :]
df_limited_uspc

Unnamed: 0,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
957,370,18,1047,7479,58,416
1266,370,13,4772,30552,367,2350
1277,370,29,2185,14492,75,500
1516,370,16,1617,11390,101,712
1653,370,15,2009,14170,134,945
...,...,...,...,...,...,...
14654,345,40,2732,15666,68,392
14657,709,19,1517,9427,80,496
14791,370,22,1291,7875,59,358
14792,345,28,2005,12222,72,436


# Select features (columns)

In [9]:
# Set X equal to the entire data set, except for the first column
X = df_limited_uspc.iloc[:,1:]

# Set y equal to the first column
y = df_limited_uspc.iloc[:,0]

In [10]:
y.value_counts()

370    348
709    162
345    158
Name: uspc_class, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [12]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_train,y_train)
model.score(X_test,y_test)
# model.feature_importances_

0.40718562874251496

In [13]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
# model.score(X,y)
model.feature_importances_

array([0.17416771, 0.20945185, 0.20973277, 0.19934493, 0.20730274])

In [14]:
# Store the top 5 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(5)
feat_imp

total_char_ct                0.209733
total_word_ct                0.209452
average_char_ct_eachclaim    0.207303
average_word_ct_eachclaim    0.199345
claim_no                     0.174168
dtype: float64

In [15]:
# Set features based on feature importances
X = df_limited_uspc[feat_imp.index]

# Use uspc_class for y values
y = df_limited_uspc['uspc_class']

# Create a Train Test Split

In [16]:
X

Unnamed: 0,total_char_ct,total_word_ct,average_char_ct_eachclaim,average_word_ct_eachclaim,claim_no
957,7479,1047,416,58,18
1266,30552,4772,2350,367,13
1277,14492,2185,500,75,29
1516,11390,1617,712,101,16
1653,14170,2009,945,134,15
...,...,...,...,...,...
14654,15666,2732,392,68,40
14657,9427,1517,496,80,19
14791,7875,1291,358,59,22
14792,12222,2005,436,72,28


In [17]:
y

957      370
1266     370
1277     370
1516     370
1653     370
        ... 
14654    345
14657    709
14791    370
14792    345
14801    370
Name: uspc_class, Length: 668, dtype: object

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X, y)

# Pre-processing

Scale the data using the MinMaxScaler

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model
Using Random Forest


In [20]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
y

957      370
1266     370
1277     370
1516     370
1653     370
        ... 
14654    345
14657    709
14791    370
14792    345
14801    370
Name: uspc_class, Length: 668, dtype: object

In [22]:
y_test

9583     370
14340    370
12441    345
9364     345
13102    345
        ... 
8961     370
11207    345
7681     709
11670    370
10959    370
Name: uspc_class, Length: 167, dtype: object

In [23]:
clf = GradientBoostingClassifier(n_estimators=2000, learning_rate=0.001, max_depth=3, random_state=0).fit(X_train_scaled, y_train)

In [24]:
# ??GradientBoostingClassifier

In [25]:
# Score the model
print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, y_test)}")

Training Data Score: 0.6107784431137725
Testing Data Score: 0.48502994011976047


# Predict

In [26]:
predictions = clf.predict(X_test_scaled)
df_pred = pd.DataFrame({"Actual":y_test, "Predicted":predictions}) 
pd.set_option("display.max_rows", None, "display.max_columns", None)
df_pred

Unnamed: 0,Actual,Predicted
9583,370,709
14340,370,370
12441,345,370
9364,345,370
13102,345,370
9558,370,370
8136,345,370
8449,345,370
6676,345,370
13429,370,370


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the GridSearchCV model
param_grid = {'n_estimators':[1, 2, 4, 6, 8],
             'max_depth':[5, 8, 15, 25, 30],
             'min_samples_split':[2, 5, 10],
             'min_samples_leaf':[1, 2, 4]}

grid = GridSearchCV(rf, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
best_model = grid.fit(X_train_scaled, y_train)

In [None]:
print(best_model.best_params_)
print(best_model.best_score_)

In [None]:
# Make predictions
grid_predictions = best_model.predict(X_test_scaled)
df_grid = pd.DataFrame({"Actual":y_test, "Predicted":grid_predictions})
df_grid

In [None]:
# Score the model
best_model.score(X_test_scaled, y_test)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_predictions))