# My notebooks

In [1]:
import pandas as pd

def nbs_to_df(nbs):
    """Build a DataFrame of notebooks from raw listing lines.

    Each line is whitespace-split and handed to ``parse``; notebooks whose
    vote count is not positive are dropped, a markdown entry (``url`` column)
    is rendered for every remaining row with ``to_url``, and the result is
    sorted by title.

    Args:
        nbs: Iterable of strings, one per notebook, in the token layout that
            ``parse`` expects.

    Returns:
        pandas.DataFrame with columns ``title``, ``slug``, ``total_votes``,
        ``date`` and ``url``, sorted by ``title`` ascending.
    """
    parsed_notebooks = [parse(a_notebook.split()) for a_notebook in nbs]
    df = pd.DataFrame(parsed_notebooks, columns=['title', 'slug', 'total_votes', 'date'])
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice (avoids SettingWithCopyWarning).
    df = df[df['total_votes'] > 0].copy()
    # Build the column row by row instead of DataFrame.apply(axis=1): on an
    # empty frame apply returns a DataFrame, which cannot be assigned to a
    # single column.
    df['url'] = [to_url(row) for _, row in df.iterrows()]
    df = df.sort_values("title", ascending=True)
    return df

def parse(a_notebook):
    """Pick the fields of interest out of one tokenized listing row.

    Token positions used:
      * first token            -> slug
      * last token             -> vote count (converted to ``int``)
      * tokens 1 up to, but not including, the 6th from the end -> title
      * 3rd token from the end -> date

    The remaining trailing tokens are ignored (presumably author name and
    time of day; verify against the listing format).

    Args:
        a_notebook: Sequence of string tokens for a single notebook.

    Returns:
        Tuple ``(title, slug, total_votes, date)``.
    """
    ref, votes_token = a_notebook[0], a_notebook[-1]
    votes = int(votes_token)
    title_tokens = a_notebook[1:-6]
    # Slicing (rather than indexing) yields an empty date for very short rows.
    date_tokens = a_notebook[-3:-2]
    return " ".join(title_tokens), ref, votes, " ".join(date_tokens)

def to_url(row, include_bullet=True, include_votes=True):
    """Render a notebook row as a markdown link to its Kaggle page.

    Args:
        row: Object exposing ``title``, ``slug`` and ``total_votes`` attributes.
        include_bullet: Prefix the link with ``"* "`` when truthy.
        include_votes: Append ``" (<total_votes>)"`` when truthy.

    Returns:
        The markdown string for the row.
    """
    link = f"[{row.title}](http://kaggle.com/{row.slug})"
    prefix = "* " if include_bullet else ""
    suffix = f" ({row.total_votes})" if include_votes else ""
    return prefix + link + suffix


def get_section(df, title, subtitle, url, matcher, sort_by):
    """Render one markdown section listing the notebooks selected by ``matcher``.

    Args:
        df: DataFrame with at least the ``title`` and ``url`` columns plus the
            column named by ``sort_by``.
        title: Section heading text.
        subtitle: Text shown in bold under the heading; the line is omitted
            when empty.
        url: Link target for the heading; when empty the heading is emitted
            as plain text instead of an empty link.
        matcher: Either a string, matched as a literal substring of each
            title, or a callable that receives a title and returns a truthy
            value for rows to keep.
        sort_by: Column to order the rows by. ``'total_votes'`` sorts
            descending; any other column sorts ascending.

    Returns:
        The section as a markdown string: heading line, optional bold
        subtitle line, a blank line, then one ``url`` entry per line.
    """
    asc = sort_by != 'total_votes'
    if isinstance(matcher, str):
        # regex=False: the marker is literal text, so characters such as "$"
        # or "[" must not be interpreted as regular-expression syntax.
        dfx = df[df['title'].str.contains(matcher, regex=False)]
    else:
        # Coerce to bool so truthy results (and an empty frame) still produce
        # a boolean row mask.
        dfx = df[df['title'].apply(matcher).astype(bool)]
    dfx = dfx.sort_values(sort_by, ascending=asc)

    # Avoid emitting "[title]()" and "****" when url / subtitle are empty.
    heading = f"[{title}]({url})" if url else title
    txt = f"## {heading}\n"
    if subtitle:
        txt += f"**{subtitle}**\n"
    txt += "\n"

    txt += '\n'.join(dfx['url'])
    return txt


In [2]:
# IPython `!` syntax: run the shell command and capture its output lines in `nbs`.
nbs = !kaggle kernels list -m --page-size 100 --sort-by voteCount
# Skip the first two captured lines — presumably the table header and its
# separator row; verify against the CLI output.
# `df` is a notebook-level variable: by_competition() reads it as a global.
df = nbs_to_df(nbs[2:])
len(df)

54

# By Competition

In [3]:
def is_other_competitions(title):
    """Return True when ``title`` contains none of the known competition markers."""
    known_markers = ("🦠", "🇮🇳", "🐠", "📖", "H&M", "💲", "🐦", "☣️")
    return not any(marker in title for marker in known_markers)


# One entry per section of the "by competition" index, in display order.
# Each tuple is (section title, subtitle, competition URL, matcher, sort column);
# by_competition() unpacks it in that order and forwards it to get_section().
COMPETITIONS = [
    (
        "Sartorius - Cell Instance Segmentation",
        "Detect single neuronal cells in microscopy images",
        "https://www.kaggle.com/c/sartorius-cell-instance-segmentation",
        "🦠",
        "total_votes",
    ),
    (
        "chaii - Hindi and Tamil Question Answering",
        "Identify the answer to questions found in Indian language passages",
        "https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering",
        "🇮🇳",
        "title",
    ),
    (
        "Jigsaw Rate Severity of Toxic Comments",
        "Rank relative ratings of toxicity between comments",
        "https://www.kaggle.com/c/jigsaw-toxic-severity-rating",
        "☣️",
        "total_votes",
    ),
    (
        "TensorFlow - Help Protect the Great Barrier Reef",
        "Detect crown-of-thorns starfish in underwater image data",
        "https://www.kaggle.com/c/tensorflow-great-barrier-reef",
        "🐠",
        "total_votes",
    ),
    (
        "Feedback Prize - Evaluating Student Writing",
        "Analyze argumentative writing elements from students grade 6-12",
        "https://www.kaggle.com/c/feedback-prize-2021",
        "📖",
        "total_votes",
    ),
    (
        "H&M Personalized Fashion Recommendations",
        "Provide product recommendations based on previous purchases",
        "https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations",
        "H&M",
        "total_votes",
    ),
    (
        "G-Research Crypto Forecasting",
        "Use your ML expertise to predict real crypto market data",
        "https://www.kaggle.com/c/g-research-crypto-forecasting",
        "💲",
        "total_votes",
    ),
    (
        "BirdCLEF 2022",
        "Identify bird calls in soundscapes",
        "https://www.kaggle.com/c/birdclef-2022",
        "🐦",
        "total_votes",
    ),
    # Catch-all section: selected with a callable instead of a marker string.
    (
        "Other competitions",
        "",
        "",
        is_other_competitions,
        "title",
    ),
]


def by_competition():
    """Render the "Organized by competition" part of the index.

    Reads the notebook-level ``df`` and emits one ``get_section`` block per
    entry of ``COMPETITIONS``, each followed by a blank line.

    Returns:
        The markdown text, starting with the top-level heading.
    """
    sections = [
        get_section(df, title, subtitle, url, matching_str, sort_by) + "\n\n"
        for title, subtitle, url, matching_str, sort_by in COMPETITIONS
    ]
    return "# Organized by competition\n\n" + "".join(sections)

# Highlights

In [46]:
# Hand-picked notebooks for the "Highlights" part, in display order.
# Each entry is (title fragment, markdown description): the fragment is matched
# as a substring of a notebook title, the description is printed under it.
HIGHLIGHTS = [
    
    ("🦠 Sartorius - Starter Torch Mask R-CNN [LB=0.273]",
    """A self-contained, simple, pure Torch Mask R-CNN implementation baseline model for the competition [Sartorius - Cell Instance Segmentation](https://www.kaggle.com/c/sartorius-cell-instance-segmentation).
    This notebook has more than 600 forks and was a high-performing end-to-end simple solution for the problem at the very beginning of the competition."""),
    
    ("Jigsaw - Incredibly Simple Naive Bayes [0.768]",
    """A very simple naive bayes model with a high accuracy. 
    The condensed code has less than 20 lines and a slight -but smart- modification of it (adapted from a notebook by Jeremy Howard) would have landed a Bronze medal: [☣️ Jigsaw - Jeremy Howard's NB [0.79744 Private]](https://www.kaggle.com/julian3833/jigsaw-jeremy-howard-s-nb-0-79744-private)."""),
    ('PyTorch- "ShortFormer" w/Chunks - Train [0.624]',
    """A baseline model for the competition [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021). 
    It approaches the problems as a token classification problem ("NER"-like) and builds a RoBERTa base model with `max_length=512`. 
    In order to do so, it manages the chunking with stride of the texts with length greater than 512 (and the posterior merge). 
    A discussion about Longformers is presented as well."""),    
    ('PyTorch- "ShortFormer" w/Chunks - Infer [0.624]',
    "The inference notebook for the previous model."),
    ("H&M - Implicit ALS model [0.014]",
    "Implicit ALS base model for the competition [H&M Personalized Fashion Recommendations](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations)."),
    ("1 - Quick start: read csv and flatten json fields",
    """A simple utility function that flattens json fields in a CSV when loaded with `pandas` for the competition [Google Analytics Customer Revenue Prediction](https://www.kaggle.com/c/ga-customer-revenue-prediction). 
    This problem appeared quite fast, so everyone was getting blocked right after loading the CSV. 
    That is why the notebook has almost 500 upvotes and more than 550 forks having just 10 lines of code. The competition had a terrible leak."""),
    ("GPT-2 Large –774M– w/Pytorch: Not that impressive", 
     """In this old notebook I applied the out-of-the-box GPT-2 models (`gpt2`, `gpt2-medium`, and `gpt2-large`) on the samples in the original blog post ([Better Language Models and Their Implications](https://openai.com/blog/better-language-models/)) using `huggingface`'s [pytorch-transformers](https://github.com/huggingface/pytorch-transformers) library, with a pretty simple code based on the library's [Quick Start](https://huggingface.co/pytorch-transformers/quickstart.html). 
The conclusion was that the results were good but not that impressive as the blogpost suggested.""")    
]

def is_highlight(row):
    """Return True if ``row['title']`` contains the title of any HIGHLIGHTS entry."""
    return any(highlight_title in row['title'] for highlight_title, _ in HIGHLIGHTS)
    
def get_highlight_order(row):
    """Return the position in HIGHLIGHTS of the first entry matching the row.

    An entry matches when its title is a substring of ``row['title']``.

    Args:
        row: Mapping-like row exposing a ``'title'`` key.

    Returns:
        The zero-based index of the first matching entry, or
        ``len(HIGHLIGHTS)`` when nothing matches so that unmatched rows sort
        after every real highlight instead of tying with the first one.
    """
    matching_positions = (
        position
        for position, (highlight_title, _) in enumerate(HIGHLIGHTS)
        if highlight_title in row['title']
    )
    return next(matching_positions, len(HIGHLIGHTS))

def get_subtitle(row):
    """Return the description of the first HIGHLIGHTS entry matching the row.

    An entry matches when its title is a substring of ``row['title']``; an
    empty string is returned when no entry matches.
    """
    descriptions = (
        description
        for highlight_title, description in HIGHLIGHTS
        if highlight_title in row['title']
    )
    return next(descriptions, "")
    
def highlights(df):
    """Render the "Highlights" part of the index.

    Keeps the rows accepted by ``is_highlight``, orders them with
    ``get_highlight_order`` and prints each one as a second-level heading
    followed by the description from ``get_subtitle``.

    Args:
        df: DataFrame with at least the ``title``, ``url`` and
            ``total_votes`` columns.

    Returns:
        The markdown text, starting with the ``# Highlights`` heading.
    """
    # Per-row lists instead of DataFrame.apply(axis=1): apply returns a
    # DataFrame for an empty frame, which is neither a valid row mask nor
    # assignable to a single column.
    keep = [is_highlight(row) for _, row in df.iterrows()]
    df_highlights = df.loc[keep, ['title', 'url', 'total_votes']].copy()
    df_highlights['order'] = [get_highlight_order(row) for _, row in df_highlights.iterrows()]
    df_highlights['subtitle'] = [get_subtitle(row) for _, row in df_highlights.iterrows()]
    df_highlights = df_highlights.sort_values("order")

    txt = "# Highlights\n\n"
    for _, row in df_highlights.iterrows():
        entry = row.url
        # Turn only the leading list bullet into a heading marker; a blanket
        # replace("*", "##") would also rewrite asterisks inside the title.
        if entry.startswith("* "):
            entry = "## " + entry[2:]
        txt += entry
        txt += f"\n{row.subtitle}\n\n"
    return txt

# Create Markdown

In [54]:
# Render both parts of the index; by_competition() reads the notebook-level `df`.
txt_by_competition = by_competition()
txt_hightlights = highlights(df)
# The highlights come first in the final document, then the per-competition index.
txt = txt_hightlights + "\n\n" + txt_by_competition
print(txt)

# Highlights

## [🦠 Sartorius - Starter Torch Mask R-CNN [LB=0.273]](http://kaggle.com/julian3833/sartorius-starter-torch-mask-r-cnn-lb-0-273) (278)
A self-contained, simple, pure Torch Mask R-CNN implementation baseline model for the competition [Sartorius - Cell Instance Segmentation](https://www.kaggle.com/c/sartorius-cell-instance-segmentation).
    This notebook has more than 600 forks and was a high-performing end-to-end simple solution for the problem at the very beginning of the competition.

## [☣️ Jigsaw - Incredibly Simple Naive Bayes [0.768]](http://kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768) (186)
A very simple naive bayes model with a high accuracy. 
    The condensed code has less than 20 lines and a slight -but smart- modification of it (adapted from a notebook by Jeremy Howard) would have landed a Bronze medal: [☣️ Jigsaw - Jeremy Howard's NB [0.79744 Private]](https://www.kaggle.com/julian3833/jigsaw-jeremy-howard-s-nb-0-79744-private).

## [📖 Py

# Highlights

## [🦠 Sartorius - Starter Torch Mask R-CNN [LB=0.273]](http://kaggle.com/julian3833/sartorius-starter-torch-mask-r-cnn-lb-0-273) (278)
A self-contained, simple, pure Torch Mask R-CNN implementation baseline model for the competition [Sartorius - Cell Instance Segmentation](https://www.kaggle.com/c/sartorius-cell-instance-segmentation).
    This notebook has more than 600 forks and was a high-performing end-to-end simple solution for the problem at the very beginning of the competition.

## [☣️ Jigsaw - Incredibly Simple Naive Bayes [0.768]](http://kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768) (186)
A very simple naive bayes model with a high accuracy. 
    The condensed code has less than 20 lines and a slight -but smart- modification of it (adapted from a notebook by Jeremy Howard) would have landed a Bronze medal: [☣️ Jigsaw - Jeremy Howard's NB [0.79744 Private]](https://www.kaggle.com/julian3833/jigsaw-jeremy-howard-s-nb-0-79744-private).

## [📖 PyTorch- "ShortFormer" w/Chunks - Train [0.624]](http://kaggle.com/julian3833/pytorch-shortformer-w-chunks-train-0-624) (82)
A baseline model for the competition [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021). 
    It approaches the problems as a token classification problem ("NER"-like) and builds a RoBERTa base model with `max_length=512`. 
    In order to do so, it manages the chunking with stride of the texts with length greater than 512 (and the posterior merge). 
    A discussion about Longformers is presented as well.

## [📖 PyTorch- "ShortFormer" w/Chunks - Infer [0.624]](http://kaggle.com/julian3833/pytorch-shortformer-w-chunks-infer-0-624) (69)
The inference notebook for the previous model.

## [H&M - Implicit ALS model [0.014]](http://kaggle.com/julian3833/h-m-implicit-als-model-0-014) (163)
Implicit ALS base model for the competition [H&M Personalized Fashion Recommendations](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations).

## [1 - Quick start: read csv and flatten json fields](http://kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields) (491)
A simple utility function that flattens json fields in a CSV when loaded with `pandas` for the competition [Google Analytics Customer Revenue Prediction](https://www.kaggle.com/c/ga-customer-revenue-prediction). 
    This problem appeared quite fast, so everyone was getting blocked right after loading the CSV. 
    That is why the notebook has almost 500 upvotes and more than 550 forks having just 10 lines of code. The competition had a terrible leak.

## [GPT-2 Large –774M– w/Pytorch: Not that impressive](http://kaggle.com/julian3833/gpt-2-large-774m-w-pytorch-not-that-impressive) (14)
In this old notebook I applied the out-of-the-box GPT-2 models (`gpt2`, `gpt2-medium`, and `gpt2-large`) on the samples in the original blog post ([Better Language Models and Their Implications](https://openai.com/blog/better-language-models/)) using `huggingface`'s [pytorch-transformers](https://github.com/huggingface/pytorch-transformers) library, with a pretty simple code based on the library's [Quick Start](https://huggingface.co/pytorch-transformers/quickstart.html). 
The conclussion was that the results were good but not that impressive as the blogpost suggested.



# Organized by competition

## [Sartorius - Cell Instance Segmentation](https://www.kaggle.com/c/sartorius-cell-instance-segmentation)
**Detect single neuronal cells in microscopy images**

* [🦠 Sartorius - Starter Torch Mask R-CNN [LB=0.273]](http://kaggle.com/julian3833/sartorius-starter-torch-mask-r-cnn-lb-0-273) (278)
* [🦠 Sartorius - Starter Baseline Torch U-net [>0.0]](http://kaggle.com/julian3833/sartorius-starter-baseline-torch-u-net-0-0) (174)
* [🦠 Sartorius - Classifier + Mask R-CNN [LB=0.28]](http://kaggle.com/julian3833/sartorius-classifier-mask-r-cnn-lb-0-28) (115)
* [🦠 Sartorius - Resnet34 Classifier](http://kaggle.com/julian3833/sartorius-resnet34-classifier) (43)
* [🦠 Sartorius - 3-line overlap-removing function](http://kaggle.com/julian3833/sartorius-3-line-overlap-removing-function) (13)

## [chaii - Hindi and Tamil Question Answering](https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering)
**Identify the answer to questions found in Indian language passages**

* [1 - The competition [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/1-the-competition-qa-for-qa-noobs) (36)
* [2 - The dataset [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/2-the-dataset-qa-for-qa-noobs) (20)
* [3 - The metric (Jaccard) [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/3-the-metric-jaccard-qa-for-qa-noobs) (19)
* [4 - Exploring Public Models [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/4-exploring-public-models-qa-for-qa-noobs) (30)
* [5-🥇XLM-Roberta+Torch's extra data [LB:0.749] 🇮🇳](http://kaggle.com/julian3833/5-xlm-roberta-torch-s-extra-data-lb-0-749) (60)
* [6- 🤗 Pre & post-processing [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/6-pre-post-processing-qa-for-qa-noobs) (29)
* [7 - Public Models Revisited [QA for QA noobs 🇮🇳]](http://kaggle.com/julian3833/7-public-models-revisited-qa-for-qa-noobs) (30)
* [Quick and Dirty Transliteration Tables 🇮🇳](http://kaggle.com/julian3833/quick-and-dirty-transliteration-tables) (13)

## [Jigsaw Rate Severity of Toxic Comments](https://www.kaggle.com/c/jigsaw-toxic-severity-rating)
**Rank relative ratings of toxicity between comments**

* [☣️ Jigsaw - Incredibly Simple Naive Bayes [0.768]](http://kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768) (186)
* [☣️ Jigsaw - Early Ensemble [LB=0.836]](http://kaggle.com/julian3833/jigsaw-early-ensemble-lb-0-836) (103)
* [☣️ Jigsaw - 🤗 HF hub out-of-the-box models](http://kaggle.com/julian3833/jigsaw-hf-hub-out-of-the-box-models) (31)
* [☣️ Jigsaw - Explore Previous Competitions Datasets](http://kaggle.com/julian3833/jigsaw-explore-previous-competitions-datasets) (9)
* [☣️ Jigsaw - Random rank [LB=0.498]](http://kaggle.com/julian3833/jigsaw-random-rank-lb-0-498) (9)
* [☣️ Jigsaw - Jeremy Howard's NB [0.79744 Private]](http://kaggle.com/julian3833/jigsaw-jeremy-howard-s-nb-0-79744-private) (7)
* [☣️ Jigsaw - New Ensemble [LB=0.853]](http://kaggle.com/julian3833/jigsaw-new-ensemble-lb-0-853) (1)

## [TensorFlow - Help Protect the Great Barrier Reef](https://www.kaggle.com/c/tensorflow-great-barrier-reef)
**Detect crown-of-thorns starfish in underwater image data**

* [🐠 Reef - A CV strategy: subsequences!](http://kaggle.com/julian3833/reef-a-cv-strategy-subsequences) (221)
* [🐠 Reef- Starter Torch FasterRCNN Train [LB=0.416]](http://kaggle.com/julian3833/reef-starter-torch-fasterrcnn-train-lb-0-416) (136)
* [🐠 Reef- Starter Torch FasterRCNN Infer [LB=0.416]](http://kaggle.com/julian3833/reef-starter-torch-fasterrcnn-infer-lb-0-416) (90)
* [🐠 DETR - Detection Transformer - Train [0.189]](http://kaggle.com/julian3833/detr-detection-transformer-train-0-189) (32)
* [🐠 DETR - Detection Transformer - Infer [0.189]](http://kaggle.com/julian3833/detr-detection-transformer-infer-0-189) (27)
* [🐠 Reef - Minimal EDA](http://kaggle.com/julian3833/reef-minimal-eda) (4)

## [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021)
**Analyze argumentative writing elements from students grade 6-12**

* [📖Feedback- Baseline🤗 Sentence Classifier [0.226]](http://kaggle.com/julian3833/feedback-baseline-sentence-classifier-0-226) (189)
* [📖 PyTorch- "ShortFormer" w/Chunks - Train [0.624]](http://kaggle.com/julian3833/pytorch-shortformer-w-chunks-train-0-624) (82)
* [📖 PyTorch- "ShortFormer" w/Chunks - Infer [0.624]](http://kaggle.com/julian3833/pytorch-shortformer-w-chunks-infer-0-624) (69)
* [📖 Topic Modeling with LDA](http://kaggle.com/julian3833/topic-modeling-with-lda) (15)
* [📖 Pytorch - ITPT - Intra-task pre-training](http://kaggle.com/julian3833/pytorch-itpt-intra-task-pre-training) (11)
* [📖 W&B-tracked Shortformers experiments](http://kaggle.com/julian3833/w-b-tracked-shortformers-experiments) (5)

## [H&M Personalized Fashion Recommendations](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations)
**Provide product recommendations based on previous purchases**

* [H&M - Implicit ALS model [0.014]](http://kaggle.com/julian3833/h-m-implicit-als-model-0-014) (163)
* [H&M - Collaborative Filtering: User-user](http://kaggle.com/julian3833/h-m-collaborative-filtering-user-user) (63)
* [H&M - Content-based: 12 most popular items [0.007]](http://kaggle.com/julian3833/h-m-content-based-12-most-popular-items-0-007) (29)

## [G-Research Crypto Forecasting](https://www.kaggle.com/c/g-research-crypto-forecasting)
**Use your ML expertise to predict real crypto market data**

* [🪙💲 G-Research- Starter LGBM Pipeline](http://kaggle.com/julian3833/g-research-starter-lgbm-pipeline) (266)
* [🪙💲 G-Research- Using the overlap fully [LB=0.99]](http://kaggle.com/julian3833/g-research-using-the-overlap-fully-lb-0-99) (161)
* [🪙💲 Proposal for a meaningful LB + Strict LGBM](http://kaggle.com/julian3833/proposal-for-a-meaningful-lb-strict-lgbm) (120)
* [[S]🪙💲G-Research - Strict LGBM example [LB=0.017]](http://kaggle.com/julian3833/s-g-research-strict-lgbm-example-lb-0-017) (21)

## [BirdCLEF 2022](https://www.kaggle.com/c/birdclef-2022)
**Identify bird calls in soundscapes**

* [🐦 Audio 101. 2- Detailed EDA](http://kaggle.com/julian3833/audio-101-2-detailed-eda) (8)
* [🐦Audio 101. 1- Audio manipulation & musical notes](http://kaggle.com/julian3833/audio-101-1-audio-manipulation-musical-notes) (8)

## [Other competitions]()
****

* [1 - Quick start: read csv and flatten json fields](http://kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields) (491)
* [1- Learning 🤗 - Out-of-the-box BERT [LB: 0.577]](http://kaggle.com/julian3833/1-learning-out-of-the-box-bert-lb-0-577) (11)
* [1- Learning 🤗 - Out-of-the-box BERT [LB: 0.8102]](http://kaggle.com/julian3833/1-learning-out-of-the-box-bert-lb-0-8102) (14)
* [2 - Quick study: LGBM, XGB and Catboost [LB: 1.66]](http://kaggle.com/julian3833/2-quick-study-lgbm-xgb-and-catboost-lb-1-66) (96)
* [2. Learning 🤗 - Out-of-the-box RoBERTa [LB: 0.53]](http://kaggle.com/julian3833/2-learning-out-of-the-box-roberta-lb-0-53) (18)
* [3- Learning 🤗 - Out-of-the-box Electra [LB: 0.58]](http://kaggle.com/julian3833/3-learning-out-of-the-box-electra-lb-0-58) (7)
* [GPT-2 Large –774M– w/Pytorch: Not that impressive](http://kaggle.com/julian3833/gpt-2-large-774m-w-pytorch-not-that-impressive) (14)
* [Index of my public notebooks](http://kaggle.com/julian3833/index-of-my-public-notebooks) (1)
* [🚢 1 - Loading and visualizing the images](http://kaggle.com/julian3833/1-loading-and-visualizing-the-images) (33)
* [🚢 2 - Understanding & plotting rle bounding boxes](http://kaggle.com/julian3833/2-understanding-plotting-rle-bounding-boxes) (61)
* [🚢 3 - Basic exploratory analysis](http://kaggle.com/julian3833/3-basic-exploratory-analysis) (30)
* [🚢 4 - Exploring public models](http://kaggle.com/julian3833/4-exploring-public-models) (44)
* [🚢 5 - Submitting the test file (1.0 public LB)](http://kaggle.com/julian3833/5-submitting-the-test-file-1-0-public-lb) (35)
