In [1]:
# Import dependencies
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Read the claims summary CSV into a DataFrame.
# NOTE(review): the recorded output shows an 'Unnamed: 0' column, which looks like
# a row index saved by an earlier to_csv call — confirm, and consider index_col=0.
df = pd.read_csv("csv_output/claims_summary_uspc_df.csv")

# Drop the columns in which every value is null
df = df.dropna(axis='columns', how='all')

# Drop the rows in which every value is null.
# dropna returns a new frame rather than modifying df, so the result has to be
# assigned back; without the assignment this step has no effect.
df = df.dropna(how='all')
df.head()

Unnamed: 0.1,Unnamed: 0,patent_number,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
0,0,RE30349,220,6,3782,23649,630,3942
1,1,RE30153,144,14,3194,19884,228,1420
2,2,RE30744,365,10,2679,18543,268,1854
3,3,RE29796,194,6,1114,7151,186,1192
4,4,RE30870,137,9,1762,11244,196,1249


In [3]:
# Number of non-null values in each column (counted down the rows)
df.count(axis='index')

Unnamed: 0                   14945
patent_number                14945
uspc_class                   14945
claim_no                     14945
total_word_ct                14945
total_char_ct                14945
average_word_ct_eachclaim    14945
average_char_ct_eachclaim    14945
dtype: int64

In [4]:
# Show the dtype pandas assigned to each column
df.dtypes

Unnamed: 0                    int64
patent_number                object
uspc_class                   object
claim_no                      int64
total_word_ct                 int64
total_char_ct                 int64
average_word_ct_eachclaim     int64
average_char_ct_eachclaim     int64
dtype: object

In [5]:
# Print the array of distinct values found in the uspc_class column
print(df.loc[:, "uspc_class"].unique())

['220' '144' '365' '194' '137' '000' '435' '156' '340' '423' '425' '219'
 '428' '062' '131' '222' '307' '074' '128' '176' '148' '356' '123' '209'
 '270' '364' '114' '271' '361' '051' '260' '073' '008' '313' '525' '249'
 '166' '055' '214' '033' '264' '426' '030' '046' '132' '101' '060' '324'
 '315' '029' '072' '503' '250' '277' '999' '215' '358' '241' '061' '430'
 '273' '424' '075' '164' '210' '357' '119' '429' '604' '536' '303' '023'
 '285' '206' '198' '354' '474' '057' '070' '047' '208' '562' '310' '564'
 '523' '367' '254' '104' '418' '227' '308' '252' '200' '052' '228' '162'
 '290' '174' '028' '106' '325' '544' '528' '040' '414' '239' '244' '192'
 '034' '427' '017' '152' '343' '560' '014' '318' '083' '294' '177' '280'
 '112' '542' '011' '350' '053' '378' '339' 'PLT' '108' '016' '160' '175'
 '187' '346' '135' '272' '096' '233' '056' '422' '433' '024' '410' '415'
 '013' '297' '382' '403' '032' '091' '204' '355' '236' '071' '179' '134'
 '099' '085' '140' '065' '362' '526' '180' '455' '3

In [6]:
# Keep only the rows whose uspc_class equals the string '074', then display them
mechanism = df[df['uspc_class'] == '074']
mechanism

Unnamed: 0.1,Unnamed: 0,patent_number,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
22,22,RE30525,74,29,32068,227552,1106,7847
29,29,RE30135,74,50,41511,269290,830,5386
51,51,RE29872,74,13,1275,7787,98,599
94,94,RE30334,74,5,4294,28723,859,5745
122,122,RE30120,74,20,5351,32758,268,1638
133,133,RE29993,74,1,1763,11191,1763,11191
303,303,RE30440,74,13,2888,16858,222,1297
316,316,RE30423,74,4,798,6019,200,1505
674,674,RE30981,74,69,4750,29076,69,421
761,761,RE30932,74,1,1435,8887,1435,8887


In [7]:
# Save the class-'074' subset to disk; the row index is written as the
# first column of the file (the pandas default, spelled out here)
mechanism.to_csv(path_or_buf="csv_output/mechanism.csv", index=True)

In [8]:
# Narrow df to the class label plus the claim-count and length metrics;
# every other column is dropped from here on
df = df.loc[:, [
    'uspc_class',
    'claim_no',
    'total_word_ct',
    'total_char_ct',
    'average_word_ct_eachclaim',
    'average_char_ct_eachclaim',
]]

In [9]:
# Render the reduced frame as a rich table (long frames are shown truncated)
display(df)

Unnamed: 0,uspc_class,claim_no,total_word_ct,total_char_ct,average_word_ct_eachclaim,average_char_ct_eachclaim
0,220,6,3782,23649,630,3942
1,144,14,3194,19884,228,1420
2,365,10,2679,18543,268,1854
3,194,6,1114,7151,186,1192
4,137,9,1762,11244,196,1249
...,...,...,...,...,...,...
14940,D15,1,12,74,12,74
14941,D15,1,12,74,12,74
14942,D18,1,11,66,11,66
14943,D09,1,10,59,10,59


In [10]:
# Count how many rows fall under each uspc_class value, most frequent first.
# NOTE(review): the recorded output lists labels such as '227.0' and '347.0'
# next to zero-padded ones like '089' — presumably some class codes are stored
# in a different format; confirm and normalise before grouping on this column.
df["uspc_class"].value_counts()

370      348
455      297
514      293
428      237
375      236
        ... 
000        1
227.0      1
470        1
089        1
347.0      1
Name: uspc_class, Length: 616, dtype: int64

In [11]:
# Per-class summary of claim_no: mean, minimum and maximum.
# The dict form of agg keeps two-level columns: ('claim_no', <statistic>).
df_groupby = (
    df.groupby(by="uspc_class")
      .agg({'claim_no': ['mean', 'min', 'max']})
)
df_groupby

Unnamed: 0_level_0,claim_no,claim_no,claim_no
Unnamed: 0_level_1,mean,min,max
uspc_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
000,10.000000,10,10
002,20.500000,2,101
003,19.500000,12,27
004,18.153846,5,50
005,22.000000,4,64
...,...,...,...
D26,1.000000,1,1
D27,1.000000,1,1
D29,1.000000,1,1
D34,1.000000,1,1


In [13]:
# Move uspc_class out of the index and into a regular column.
# The guard makes this cell safe to re-run: calling reset_index again on the
# already-reset frame would insert a spurious 'index' column. The result is
# assigned rather than applied in place, so the step is an explicit rebinding.
if df_groupby.index.name == "uspc_class":
    df_groupby = df_groupby.reset_index()

In [14]:
# Look up the aggregated claim statistics for the class labelled '074'
df_groupby[df_groupby['uspc_class'] == '074']

Unnamed: 0_level_0,uspc_class,claim_no,claim_no,claim_no
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max
55,74,21.98,1,77
