In [None]:
import pandas as pd
from IPython.core.display import HTML

# Directory that holds the CSV tables read below.
path = "../input/"

# Kernel versions, kernels and users, loaded in that order.
versions, kernels, users = (
    pd.read_csv(path + csv_name)
    for csv_name in ("KernelVersions.csv", "Kernels.csv", "Users.csv")
)

# KernelLanguageId (as a string) -> display name of the language.
language_map = dict(
    [(lang_id, 'R') for lang_id in ('1', '5', '12', '13', '15', '16')]
    + [(lang_id, 'Python') for lang_id in ('2', '8', '9', '14')]
)

def pressence_check(title, tokens, ignore=None):
    """Return True when ``title`` mentions any of ``tokens`` and none of ``ignore``.

    Matching is case-insensitive and substring based. A multi-word token
    matches when every one of its whitespace-separated words occurs somewhere
    in the title (in any order, not necessarily adjacent).

    Args:
        title: Title to inspect. Non-string values (for example the NaN that
            pandas uses for a missing cell) never match.
        tokens: Iterable of search phrases; a match on any one is enough.
        ignore: Optional iterable of substrings that veto a match when they
            occur in the title.

    Returns:
        bool: True if the title is relevant, False otherwise.
    """
    if not isinstance(title, str):
        return False
    lowered = title.lower()

    # An ignore hit always wins, so it is checked first. The title is
    # lower-cased, so the ignore token must be as well.
    if any(token.lower() in lowered for token in (ignore or ())):
        return False

    for token in tokens:
        words = token.split()
        # Skip blank tokens: all() over zero words is True and would
        # otherwise mark every title as relevant.
        if words and all(word.lower() in lowered for word in words):
            return True
    return False

## Helper used to check whether the latest version of a kernel is still about the same topic
def get_latest(idd):
    """Return the highest ``VersionNumber`` recorded for kernel ``idd``.

    Args:
        idd: Value looked up in the ``KernelId`` column of the module-level
            ``versions`` frame.

    Returns:
        The maximum ``VersionNumber`` among the matching rows, or NaN when no
        row matches.
    """
    kernel_versions = versions.loc[versions['KernelId'] == idd, 'VersionNumber']
    # max() replaces sorting the whole slice only to read its first row.
    return kernel_versions.max()

def _language_name(lang_id):
    """Translate a ``KernelLanguageId`` value into a name from ``language_map``."""
    try:
        # Normalise integral floats (2.0 -> '2'); pandas stores an integer
        # column as float as soon as it contains a missing value.
        key = str(int(lang_id))
    except (TypeError, ValueError, OverflowError):
        return ""
    return language_map.get(key, "")


def get_kernels(tokens, n, ignore=None):
    """Return the top-voted kernels whose version titles match ``tokens``.

    Steps:
      1. Keep the rows of ``versions`` whose ``Title`` passes
         ``pressence_check(title, tokens, ignore)``.
      2. Aggregate them per ``KernelId`` and keep the ``n`` kernels with the
         largest summed ``TotalVotes``.
      3. Drop kernels whose newest matching version is not their newest
         version overall (the latest title no longer matches).
      4. Join kernel and author details and map the language id to a name.

    Args:
        tokens: Search phrases forwarded to ``pressence_check``.
        n: Number of kernels kept in step 2; step 3 may remove some of them,
            so fewer than ``n`` rows can be returned.
        ignore: Optional substrings that veto a title match.

    Returns:
        DataFrame with the columns Title, CurrentUrlSlug, Language,
        TotalViews, TotalComments, TotalVotes, DisplayName and UserName,
        sorted by ``TotalVotes`` descending with a fresh 0-based index.
    """
    ignore = [] if ignore is None else ignore

    # The mask stays local so the shared ``versions`` frame is not mutated.
    # Missing titles are treated as empty text so they cannot break matching.
    is_relevant = versions['Title'].fillna('').apply(
        lambda title: pressence_check(title, tokens, ignore))
    relevant = versions[is_relevant.astype(bool)]

    results = relevant.groupby('KernelId').agg({
        'TotalVotes': 'sum',
        'KernelLanguageId': 'max',
        # Title of the group's last row. Joining and re-splitting on '#'
        # cut off everything before a '#' inside the title itself.
        'Title': lambda titles: titles.iloc[-1],
        'VersionNumber': 'max',
    })
    results = results.reset_index().sort_values('TotalVotes', ascending=False).head(n)
    results = results.rename(columns={'KernelId': 'Id', 'TotalVotes': 'Votes'})

    # Keep a kernel only when its newest matching version is also its newest
    # version overall. The vectorised comparison also works on an empty frame.
    results['latest_version'] = results['Id'].apply(get_latest)
    results = results[results['VersionNumber'] == results['latest_version']]

    results = results.merge(kernels, on="Id")
    results = results.merge(users.rename(columns={'Id': "AuthorUserId"}), on='AuthorUserId')
    results['Language'] = results['KernelLanguageId'].apply(_language_name)

    # Stable sort keeps the incoming order for ties; the index is reset so it
    # equals the row's position in the ranking.
    results = results.sort_values("TotalVotes", ascending=False, kind='mergesort')
    results = results.reset_index(drop=True)
    return results[['Title', 'CurrentUrlSlug', 'Language', 'TotalViews',
                    'TotalComments', 'TotalVotes', "DisplayName", "UserName"]]


def best_kernels(tokens, n=10, ignore=None):
    """Display an HTML table of the top kernels matching ``tokens``.

    Args:
        tokens: Non-empty list of search phrases forwarded to ``get_kernels``;
            the first one, title-cased, is used as the table heading.
        n: Forwarded to ``get_kernels`` as the number of kernels to consider.
        ignore: Optional substrings that veto a title match.

    Returns:
        None. The table is shown through ``display(HTML(...))``.
    """
    # Function-scope import: the file does not import ``html`` at the top.
    from html import escape

    response = get_kernels(tokens, n, [] if ignore is None else ignore)

    # The header is a regular <tr>; its first cell is the (unlabelled) rank
    # column so the header lines up with the seven cells of each data row.
    parts = ["""<style>
                .rendered_html tr {font-size: 12px; text-align: left}
            </style>
            <h3><font color="#1768ea">""" + tokens[0].title() + """</font></h3>
            <table>
            <tr>
                <td></td>
                <td><b>Kernel</b></td>
                <td><b>Author</b></td>
                <td><b>Language</b></td>
                <td><b>Views</b></td>
                <td><b>Comments</b></td>
                <td><b>Votes</b></td>
            </tr>"""]

    row_template = (
        '<tr>'
        '<td>{rank}</td>'
        '<td><a href="{url}" target="_blank"><b>{title}</b></a></td>'
        '<td><a href="{author_url}" target="_blank">{author}</a></td>'
        '<td>{language}</td>'
        '<td>{views}</td>'
        '<td>{comments}</td>'
        '<td>{votes}</td>'
        '</tr>'
    )

    # Rank by position, not by index label, so the numbering is always 1..k.
    for rank, (_, row) in enumerate(response.iterrows(), start=1):
        author_url = "https://www.kaggle.com/" + str(row['UserName'])
        url = author_url + "/" + str(row['CurrentUrlSlug'])
        # Titles and names are user-supplied text: escape them, and quote the
        # href values, so they cannot break or inject markup.
        parts.append(row_template.format(
            rank=rank,
            url=escape(url),
            title=escape(str(row['Title'])),
            author_url=escape(author_url),
            author=escape(str(row['DisplayName'])),
            language=escape(str(row['Language'])),
            views=escape(str(row['TotalViews'])),
            comments=escape(str(row['TotalComments'])),
            votes=escape(str(row['TotalVotes'])),
        ))

    parts.append("</table>")
    display(HTML("".join(parts)))

# Data Science Glossary on Kaggle

Kaggle is the place to do data science projects. There are so many algorithms and concepts to learn, and Kaggle Kernels are one of the best resources on the internet for understanding the practical implementation of algorithms. There are almost 200,000 kernels published on Kaggle, and it can be difficult to search for the right implementation. I have used the [Meta Kaggle](https://www.kaggle.com/kaggle/meta-kaggle) database to create a glossary of data science models, techniques and tools shared in Kaggle kernels. One can use this kernel as a single place to find other great kernels shared by great authors. Hope you like this kernel.  


## Contents 

<ul>
  <li>1. Regression Algorithms
    <ul>
    <li>1.1 Linear Regression</li>
    <li>1.2 Logistic Regression</li>
    </ul>
  </li>
    <li>2. Regularization Algorithms
    <ul>
    <li>2.1 Ridge Regression</li>
    <li>2.2 Lasso Regression</li>
    <li>2.3 Elastic Net</li>
    </ul>
  </li>
    <li>3. Tree Based Models
    <ul>
    <li>3.1 Decision Tree</li>
    <li>3.2 Random Forests</li>
    <li>3.3 Lightgbm</li>
    <li>3.4 XgBoost</li>
    <li>3.5 Cat Boost</li>
    <li>3.6 Gradient Boosting</li>
    </ul>
  </li>
<li>4. Neural Networks and Deep Learning
    <ul>
    <li>4.1 Neural Networks</li>
    <li>4.2 AutoEncoders</li>
    <li>4.3 DeepLearning</li>
    <li>4.4 Convolutional Neural Networks / CNN</li>
    <li>4.5 Recurrent Neural Networks / RNN</li>
    <li>4.6 LSTMs</li>
    <li>4.7 GRUs</li>
    <li>4.8 MxNet</li>
    <li>4.9 ResNet</li>
    <li>4.10 CapsuleNets</li>
    <li>4.11 Unet</li>
    <li>4.12 VGGs</li>
    <li>4.13 AlexNet</li>
    <li>4.14 Xception</li>
    <li>4.15 Inception Nets</li>
     <li>4.16 Computer Vision</li>
     <li>4.17 Transfer Learning</li>
     <li>4.18 RCNN</li>
     <li>4.19 Object Detection</li>
     <li>4.20 MobileNet </li>
     </ul>
  </li>
<li>5. Clustering Algorithms
    <ul>
    <li>5.1 K Means Clustering </li>
    <li>5.2 Hierarchical Clustering</li>
    <li>5.3 DB Scan</li>
    <li>5.4 Unsupervised Learning </li>
    </ul>
  </li>
  <li>6. Misc - Models
    <ul>
    <li>6.1 Naive Bayes </li>
    <li>6.2 SVMs</li>
    <li>6.3 KNN</li>
    <li>6.4 Recommendation Engine </li>
    </ul>
  </li>
  <li>7.1 Data Science Techniques - Preprocessing
    <ul>
    <li>a. EDA, Exploration </li>
    <li>b. Feature Engineering </li>
    <li>c. Feature Selection </li>
    <li>d. Outlier Treatment</li>
    <li>e. Anomaly Detection</li>
    <li>f. SMOTE</li>
    <li>g. Pipeline</li>
    <li>h. Missing Values</li>
    </ul>
  </li>
  <li>7.2 Data Science Techniques - Dimensionality Reduction
    <ul>
    <li>a. Dataset Decomposition </li>
    <li>b. PCA </li>
    <li>c. Tsne </li>
    <li>d. SVD </li>
    </ul>
  </li>
  <li>7.3 Data Science Techniques - Post Modelling
    <ul>
    <li>a. Cross Validation </li>
    <li>b. Model Selection </li>
    <li>c. Model Tuning </li>
    <li>d. Grid Search </li>
    </ul>
  </li>
  <li>7.4 Data Science Techniques - Ensembling
    <ul>
    <li>a. Ensembling </li>
    <li>b. Stacking </li>
    <li>c. Bagging</li>
    <li>d. Blending</li>
    </ul>
  </li>
  <li>8. Text Data 
    <ul>
    <li>8.1. NLP </li>
    <li>8.2. Topic Modelling </li>
    <li>8.3. Word Embeddings </li>
    <li>8.4. Spacy </li>
    <li>8.5. NLTK </li>
    <li>8.6. TextBlob </li>
    </ul>
  </li>
 <li>9. Data Science Tools 
    <ul>
    <li>9.1 Scikit Learn </li>
    <li>9.2 TensorFlow </li>
    <li>9.3 Theano </li>
    <li>9.4 Keras </li>
    <li>9.5 PyTorch </li>
    <li>9.6 Vowpal Wabbit </li>
    <li>9.7 ELI5 </li>
    <li>9.8 HyperOpt </li>
    <li>9.9 Pandas </li>
    <li>9.10 Sql </li>
    <li>9.11 BigQuery </li>
    <li>9.12 H2o </li>
    <li>9.13 Fast.AI </li>
    </ul>
  </li>
<li>10. Data Visualizations 
    <ul>
    <li>10.1. Visualizations </li>
    <li>10.2. Plotly </li>
    <li>10.3. Seaborn </li>
    <li>10.4. D3.Js </li>
    <li>10.5. Bokeh </li>
    <li>10.6. Highchart </li>
    <li>10.7. Folium </li>
    <li>10.8. ggPlot </li>
    </ul>
  </li>
  <li>11. Time Series  
    <ul>
    <li>11.1. Time Series Analysis </li>
    <li>11.2. ARIMA </li>
    <li>11.3. Forecasting </li>
    </ul>
  </li>
    <li>12. Misc Materials
    <ul>
    <li>12.1. Best Tutorials on Kaggle </li>
    <li>12.2. Data Leak </li>
    </ul>
  </li>
</ul>


<br><br>

## 1. Regression Algorithms


In [None]:
# Glossary entry: linear regression.
tokens = ['linear regression']
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: logistic regression ("logistic" alone is enough to match).
tokens = ["logistic regression", "logistic"]
best_kernels(tokens, n=10)

## 2. Regularization Algorithms

In [None]:
# Glossary entry: ridge regression.
tokens = ["Ridge"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: lasso regression.
tokens = ["Lasso"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: elastic net (only the top 4 are considered).
tokens = ["ElasticNet"]
best_kernels(tokens, n=4)

## 3. Tree Based Models

In [None]:
# Glossary entry: decision trees.
tokens = ["Decision Tree"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: random forests.
tokens = ["random forest"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: LightGBM, including its common spellings and abbreviation.
tokens = ["lightgbm", "light gbm", "lgb"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: XGBoost and its abbreviation.
tokens = ["xgboost", "xgb"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: CatBoost.
tokens = ["catboost"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: gradient boosting.
tokens = ["gradient boosting"]
best_kernels(tokens, n=10)

## 4. Neural Networks and Deep Learning Models

In [None]:
# Glossary entry: neural networks.
tokens = ["neural network"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: autoencoders.
tokens = ["autoencoder"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: deep learning.
tokens = ["deep learning"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: convolutional neural networks, spelled out or as "cnn".
tokens = ["convolutional neural networks", "cnn"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: recurrent neural networks, via "recurrent" or "rnn".
tokens = ["recurrent", "rnn"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: LSTM.
tokens = ["lstm"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: GRU. "gru" is also a substring of "grupo", so titles
# containing that word are excluded.
tokens = ["gru"]
ignore = ["grupo"]
best_kernels(tokens, n=10, ignore=ignore)

In [None]:
# Glossary entry: MXNet.
tokens = ["mxnet"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: ResNet.
tokens = ["resnet"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: capsule networks (only the top 5 are considered).
tokens = ["Capsule network", "capsulenet"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: VGG (only the top 5 are considered).
tokens = ["vgg"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: U-Net.
tokens = ["unet"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: AlexNet (only the top 5 are considered).
tokens = ["alexnet"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: Xception (only the top 5 are considered).
tokens = ["xception"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: Inception networks (only the top 5 are considered).
tokens = ["inception"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: computer vision (only the top 5 are considered).
tokens = ["computer vision"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: transfer learning (matched on the word "transfer" alone).
tokens = ["transfer"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: YOLO (only the top 5 are considered).
tokens = ["yolo"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: object detection (only the top 5 are considered).
tokens = ["object detection"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: R-CNN (only the top 5 are considered).
tokens = ["rcnn"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: MobileNet (only the top 5 are considered).
tokens = ["mobilenet"]
best_kernels(tokens, n=5)

## 5. Clustering Algorithms 

In [None]:
# Glossary entry: k-means clustering, written with or without a space.
tokens = ["kmeans", "k means"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: hierarchical clustering (only the top 3 are considered).
tokens = ["hierarchical clustering"]
best_kernels(tokens, n=3)

In [None]:
# Glossary entry: DBSCAN.
tokens = ["dbscan"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: unsupervised learning.
tokens = ["unsupervised"]
best_kernels(tokens, n=10)

## 6. Misc - Models 

In [None]:
# Glossary entry: naive Bayes.
tokens = ["naive bayes"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: support vector machines.
tokens = ["svm"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: k-nearest neighbours.
tokens = ["knn"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: recommendation engines (only the top 5 are considered).
tokens = ["recommendation engine"]
best_kernels(tokens, n=5)

## 7. Important Data Science Techniques

### 7.1 Preprocessing

In [None]:
# Glossary entry: exploratory data analysis, under any of three spellings.
tokens = ["EDA", "exploration", "exploratory"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: feature engineering.
tokens = ["feature engineering"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: feature selection.
tokens = ["feature selection"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: outlier treatment ("outlier" alone is enough to match).
tokens = ["outlier treatment", "outlier"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: anomaly detection ("anomaly" alone is enough to match).
tokens = ["anomaly detection", "anomaly"]
best_kernels(tokens, n=8)

In [None]:
# Glossary entry: SMOTE (only the top 5 are considered).
tokens = ["smote"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: pipelines.
tokens = ["pipeline"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: missing-value handling.
tokens = ["missing value"]
best_kernels(tokens, n=10)

### 7.2 Dimensionality Reduction

In [None]:
# "dimensionality" is the correct spelling; the misspelt variant is kept so
# that titles sharing the typo still match.
tokens = ['dataset decomposition', 'dimensionality reduction', 'dimentionality reduction']
best_kernels(tokens, 2)

In [None]:
# Glossary entry: principal component analysis.
tokens = ["PCA"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: t-SNE, with or without the hyphen.
tokens = ["Tsne", "t-sne"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: singular value decomposition.
tokens = ["svd"]
best_kernels(tokens, n=10)

### 7.3 Post Modelling Techniques

In [None]:
# Glossary entry: cross validation.
tokens = ["cross validation"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: model selection.
tokens = ["model selection"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: model tuning ("tuning" alone is enough to match).
tokens = ["model tuning", "tuning"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: grid search, written with or without a space.
tokens = ["gridsearch", "grid search"]
best_kernels(tokens, n=10)

### 7.4 Ensembling

In [None]:
# Glossary entry: ensembling.
tokens = ["ensemble"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: stacking ("stack" alone is enough to match).
tokens = ["stacking", "stack"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: bagging.
tokens = ["bagging"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: blending.
tokens = ["blend"]
best_kernels(tokens, n=10)

## 8. Text Data

In [None]:
# Glossary entry: natural language processing and text mining.
tokens = ["NLP", "Natural Language Processing", "text mining"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: topic modelling, including LDA.
tokens = ["topic modelling", "lda"]
best_kernels(tokens, n=8)

In [None]:
# Glossary entry: word embeddings and the common embedding families.
tokens = ["word embedding", "fasttext", "glove", "word2vec", "word vector"]
best_kernels(tokens, n=8)

In [None]:
# Glossary entry: spaCy.
tokens = ["spacy"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: NLTK (only the top 5 are considered).
tokens = ["nltk"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: TextBlob (only the top 5 are considered).
tokens = ["textblob"]
best_kernels(tokens, n=5)

## 9. Data Science Tools

In [None]:
# Glossary entry: scikit-learn (matched on "scikit").
tokens = ["scikit"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: TensorFlow, written with or without a space.
tokens = ["tensorflow", "tensor flow"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Theano.
tokens = ["theano"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Keras.
tokens = ["keras"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: PyTorch.
tokens = ["pytorch"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Vowpal Wabbit, written with or without a space.
tokens = ["vowpal wabbit", "vowpalwabbit"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: ELI5.
tokens = ["eli5"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Hyperopt (only the top 5 are considered).
tokens = ["hyperopt"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: pandas.
tokens = ["pandas"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: SQL.
tokens = ["SQL"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: BigQuery, written with or without a space.
tokens = ["bigquery", "big query"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: GPU usage.
tokens = ["gpu"]
best_kernels(tokens, n=10)

In [None]:
# The library is "H2O" (letter O). The original token 'h20' (digit zero)
# missed correctly spelt titles; it is kept as a fallback for titles that
# share the typo.
tokens = ['h2o', 'h20']
best_kernels(tokens, 5)

In [None]:
# Glossary entry: fast.ai, written with or without the dot.
tokens = ["fastai", "fast.ai"]
best_kernels(tokens, n=10)

## 10. Data Visualization

In [None]:
# Glossary entry: visualization, in US or UK spelling.
tokens = ["visualization", "visualisation"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Plotly, written with or without the dot.
tokens = ["plotly", "plot.ly"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: seaborn.
tokens = ["seaborn"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: D3.js (only the top 4 are considered).
tokens = ["d3.js"]
best_kernels(tokens, n=4)

In [None]:
# Glossary entry: Bokeh.
tokens = ["bokeh"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Highcharts (matched on "highchart").
tokens = ["highchart"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: Folium (only the top 5 are considered).
tokens = ["folium"]
best_kernels(tokens, n=5)

In [None]:
# Glossary entry: ggplot.
tokens = ["ggplot"]
best_kernels(tokens, n=10)

## 11. Time Series

In [None]:
# Glossary entry: time series.
tokens = ["time series"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: ARIMA.
tokens = ["arima"]
best_kernels(tokens, n=10)

In [None]:
# Glossary entry: forecasting.
tokens = ["forecasting"]
best_kernels(tokens, n=10)

## 12. Misc Materials 

### 12.1 Some of the Best Tutorials on Kaggle

In [None]:
# Glossary entry: tutorials.
tokens = ["tutorial"]
best_kernels(tokens, n=10)

### 12.2 Data Leak

In [None]:
# Glossary entry: data leaks ("leak" alone is enough to match).
tokens = ["data leak", "leak"]
best_kernels(tokens, n=10)

<br>
Thanks for viewing. Please suggest other items that could be added to the list. If you liked this kernel, **please upvote**   
