## Kaggle Python public API for data collection : Notebook Classification TASK

In [1]:
DATA_PATH_KEYWORDS = '../data/search_keywords.csv'
DATA_PATH_NOTEBOOKS = '../data/'

Importing librairies

In [2]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import numpy as np

In [3]:
api = KaggleApi()
api.authenticate()

Read the file containing the search keywords used to retrieve notebooks 

In [5]:
keywords_df = pd.DataFrame(columns=['Category', 'Subcategory'])
keywords_df = pd.read_csv(DATA_PATH_KEYWORDS, sep=';') 
keywords_df

Unnamed: 0,subcategory,category
0,linear regression,regression
1,lasso regression,regression
2,randomforestregression,regression
3,ridge regression,regression
4,XGBRegressor,regression
...,...,...
89,sarsa,reinforcement learning
90,ddpg,reinforcement learning
91,qlearning,reinforcement learning
92,markov decision,reinforcement learning


The following function returns the category of a Notebook based on its subcategory

In [4]:
def search(sub,data):
    for i in data.index:
        if data.loc[i]['subcategory'] == sub:
            return data.loc[i]['category']

# List of notebooks

In [5]:
df = pd.DataFrame(columns=['title','subcategory','category'])

In [None]:
for keyword in keywords_df['subcategory']:
    for i in range(1,11):
        try :
                kernels = api.kernels_list(search = keyword, page=i)
                for kernel in kernels:
                    df.loc[len(df)]=[kernel,keyword,search(keyword,keywords_df)]
        except Exception as e:
            print('Kaggle API exception :', kernel, 'Notebook not found')

In [13]:
df.head(10)

Unnamed: 0,title,subcategory,category
0,Linear Regression Tutorial,linear regression,regression
1,Car Price Prediction (Linear Regression - RFE),linear regression,regression
2,Multiple Linear Regression,linear regression,regression
3,Simple Linear Regression for Salary Data,linear regression,regression
4,Price analysis and Linear Regression,linear regression,regression
5,Linear Regression Project,linear regression,regression
6,PyTorch basics - Linear Regression from scratch,linear regression,regression
7,FIFA in depth analysis with Linear Regression,linear regression,regression
8,Sales Prediction (Simple Linear Regression),linear regression,regression
9,Linear Regression,linear regression,regression


In [None]:
df.to_csv(DATA_PATH_NOTEBOOKS+'ntb_list.csv')

### Before collecting
- Delete duplicate notebooks : print duplicate categories => choose best category to keep
- Delete notebooks with non-english titles

In [8]:
df = pd.read_csv(DATA_PATH_NOTEBOOKS+'ntb_list.csv', sep=',', encoding='utf-8', index_col=0)
df.head()

Unnamed: 0,title,subcategory,category
0,Linear Regression Tutorial,linear regression,regression
1,Car Price Prediction (Linear Regression - RFE),linear regression,regression
2,Multiple Linear Regression,linear regression,regression
3,Simple Linear Regression for Salary Data,linear regression,regression
4,Price analysis and Linear Regression,linear regression,regression


In [9]:
# a dict with duplicate titles and their respective positions in the df
duplicates = df[df.duplicated('title', keep=False)].groupby('title').groups

In [10]:
def flatten_list(l):
    return [item for sublist in l for item in sublist]

In [73]:
import collections

def to_be_dropped_indices(dictionary):
    to_be_dropped = [] # indices to be dropped from original df
    temp = []

    for k,v in dictionary.items():
        duplicates_df = df[df.index.isin(v.values)]
        
        # if the rows have the same category, they are all dropped except the last one
        if len(set(list(duplicates_df.category))) == 1:
            to_be_dropped.append(list(duplicates_df.index[:len(duplicates_df.category)-1]))
        else: 
            # drop least common category for a notebook
            dup_counter = collections.Counter(duplicates_df.category)
            most_common = dup_counter.most_common()[0][0]
            how_many_common = dup_counter.most_common()[0][1]
            to_be_dropped.append(list(duplicates_df.index.where(duplicates_df['category'] != most_common).dropna()))
            # to_be_dropped.append(list(duplicates_df.index[:how_many_common].where(duplicates_df['category'] == most_common).dropna())) NOT SURE ABOUT THIS ONE !
            print(how_many_common)

    return flatten_list(to_be_dropped)
            
            


# len(flatten_list(to_be_dropped))

In [74]:
to_be_dropped_indices(duplicates)

4
1
1
4
1
1
3
2
1
1
1
1
1
1
1
1
2
1
1
2
1
1
1
6
1
3
4
6
2
6
3
2
1
2
2
1
1
3
1
1
3
1
1
3
1
2
5
2
1
1
2
1
1
1
1
1
1
1
1
1
2
1
3
1
1
3
2
2
1
2
2
2
3
7
7
1
2
1
3
6
2
1
1
1
1
1
3
1
2
1
1
1
1
2
2
3
1
2
4
1
1
1
1
1
1
3
1
2
2
1
1
1
1
2
1
1
2
5
4
1
1
2
2
1
1
1
1
3
2
2
1
1
1
1
4
1
2
1
1
1
1
1
1
2
4
6
3
2
1
2
1
1
1
1
1
1
6
3
2
4
10
1
1
2
1
3
2
1
2
4
1
1
1
3
1
1
1
1
3
1
3
1
1
1
1
1
10
14
2
3
2
3
6
1
2
4
1
1
1
1
2
2
1
1
2
1
1
1
1
2
1
1
2
1
1
1
1
1
1
1
2
2
1
1
2
1
1
1
1
1
3
2
1
1
1
1
1
1
1
3
1
2
2
2
10
1
1
1
1
1
1
2
1
2
4
2
2
1
1
1
8
1
1
1
5
1
1
4
1
2
2
2
2
1
1
1
1
1
2
1
1
4
4
2
1
1
3
1
1
2
1
1
2
1
1
1
1
3
1
1
1
1
1
1
2
1
1
4
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
2
1
1
1
2
1
1
1
3
1
2
1
1
1
1
2
2
1
1
1
3
1


['8646',
 '7627',
 '9350',
 '6759',
 '9313',
 '10416',
 '2799',
 '2196',
 '11264',
 '2471',
 '10699',
 '5971',
 '8918',
 '452',
 '1062',
 '646',
 '964',
 '10694',
 '2769',
 '2937',
 '9348',
 '10209',
 '8434',
 '2884',
 '3058',
 '3268',
 '3292',
 '3386',
 '3785',
 '3884',
 '3980',
 '4122',
 '4247',
 '4605',
 '4662',
 '1882',
 '7858',
 '7919',
 '9330',
 '10530',
 '11589',
 '1971',
 '10185',
 '9006',
 '1617',
 '1872',
 '2230',
 '2464',
 '6980',
 '6978',
 '10618',
 '10642',
 '7974',
 '10645',
 '10681',
 '9047',
 '9168',
 '10333',
 '2997',
 '11384',
 '9080',
 '9326',
 '10366',
 '1926',
 '2527',
 '10661',
 '9386',
 '8931',
 '3284',
 '3866',
 '2891',
 '3410',
 '3786',
 '3984',
 '4126',
 '4255',
 '9093',
 '5726',
 '734',
 '1008',
 '7797',
 '8748',
 '2029',
 '2465',
 '7810',
 '8959',
 '3081',
 '3083',
 '3088',
 '3096',
 '3097',
 '585',
 '9008',
 '1835',
 '1888',
 '2080',
 '2461',
 '9380',
 '1987',
 '2179',
 '3073',
 '10696',
 '2938',
 '11177',
 '11191',
 '246',
 '8989',
 '10275',
 '199',
 '5108

In [99]:
df[df.index.isin([1987, 2179, 2337])]

Unnamed: 0,title,subcategory,category
1987,A Journey through Titanic,svc,classification
2179,A Journey through Titanic,naive bayes,classification
2337,A Journey through Titanic,knn,classification


# Collecting notebooks by category

In [52]:
for keyword in keywords_df['subcategory']:
    for i in range(1,16):
        try :
                kernels = api.kernels_list(search = keyword, page=i)
                for kernel in kernels:
                        api.kernels_pull(kernel.ref, path = DATA_PATH_NOTEBOOKS + search(keyword,keywords_df))
        except Exception as e:
            print('Kaggle API exception : ', kernel.ref, 'Notebook not found')

Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook not found
Kaggle API exception : Notebook no

In [21]:
import re
s = '<a class=""anchor"" id="2"></a>hkdfv'
result = re.search(r'<a class=""anchor""(.*?)</a>', s).group(1)
result = re.sub(r'<a class=""anchor"".*?</a>'," ",s)
result

' hkdfv'