# Methods Crawling
Papers with Code의 https://paperswithcode.com/methods/ 를 뜯어보자!


## Category

## Library
- bs4
- 

## Environment
- Google Colab

# 참고 코드

## Beautiful Soup



### 사용 예시 1 : class로 찾아 오기

``` python
import requests
from bs4 import BeautifulSoup

url = "https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification/notebooks?competitionId=23870&sortBy=scoreDescending"

response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')#'html.parser')    
ul = soup.find('ul', class_="km-list km-list--avatar-list km-list--three-line")
print(soup)

```

### 사용 예시 2 : 테이블 데이터
- [image models](https://paperswithcode.com/methods/category/image-models)

```python
import requests
from bs4 import BeautifulSoup

url = 'https://paperswithcode.com/methods/category/image-models'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# table id 가 methodsTable 인 테이블 가져오기
table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="methodsTable")
rows = table.findAll(lambda tag: tag.name=='div')

model_names = [r.text.strip() for r in rows]
len(model_names)
```


### 예시3 : 태그로
```python
from bs4 import SoupStrainer

only_a_tags = SoupStrainer("a")
```

### 예시 4 : CSS selector 이용, 함수화
```python
def data_gathering():
    """Scrape the list found at the module-level ``url`` and render it.

    Fetches ``url``, locates the ``<ul>`` carrying the expected list classes
    and extracts one record per ``<li>`` using CSS selectors.

    Returns:
        Whatever ``render_template("data_gathering.html", movie_list=...)``
        returns; ``movie_list`` holds one
        ``[img, name, rating, genres, star_rating, participants, creators, stars]``
        list per ``<li>``.

    Raises:
        IndexError: if an ``<li>`` lacks one of the elements read with ``[0]``.

    NOTE(review): ``url``, ``requests``, ``BeautifulSoup`` and
    ``render_template`` are not defined in this snippet and must be provided
    by the surrounding module (presumably a Flask app -- confirm).
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    ul = soup.find('ul', class_="km-list km-list--avatar-list km-list--three-line")
    # If the list is missing (layout change, JS-rendered page) render an empty
    # result instead of failing with AttributeError on None.
    li_all = ul.find_all('li') if ul is not None else []

    # The original appended to `movie_list` / `img_list` / `movie_name_list`
    # without ever defining them; they are defined here.
    movie_list = []
    for li in li_all:
        img = li.find('img')
        img_list = img.get('src') if img is not None else None
        movie_name_list = li.select('dl > dt > a')[0].text
        # NOTE: rstrip() removes any trailing characters from the given set,
        # not the literal suffix; kept as in the original.
        rating_list = ''.join([rating.text for rating in li.select('dl > dt > span')]).rstrip(' 관람가')
        star_rating_list = li.select('dl > dd > dl > dd > div > a > span.num')[0].text
        number_of_participants_list = li.select('dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')[0].text
        genres_list = ','.join([genre.text for genre in li.select('dl > dd > dl > :nth-child(2) > span.link_txt > a')])
        creators_list = ','.join([creators.text for creators in li.select('dl > dd > dl > :nth-child(4) > span.link_txt > a')])
        stars_list = ','.join([star.text for star in li.select('li > dl > dd:nth-child(3) > dl > dd:nth-child(6) > span > a')])
        movie_list.append([img_list, movie_name_list, rating_list, genres_list, star_rating_list, number_of_participants_list, creators_list, stars_list])
    print(movie_list)
    return render_template("data_gathering.html", movie_list=movie_list)

```

## Selenium

### chromium-chromedriver 설치

- 설치
- 사용선언
- path 추가

```python
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
```

```python
from selenium import webdriver
import time
import datetime

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
```

### 사용 예시 1

```python
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
url = "https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification/notebooks?competitionId=23870&sortBy=scoreDescending"
# Pass the options with `options=`: the `chrome_options=` keyword and the
# positional driver path are rejected by current Selenium 4 releases.
# chromedriver was copied to /usr/bin above, so it is found on PATH.
driver = webdriver.Chrome(options=chrome_options)

# Optionally allow for page-load time with an implicit wait:
# driver.implicitly_wait(3)
driver.get(url)
```

### 함수 1 : 웹페이지 맨 하단까지 스크롤링
  - click_nolink_for_scrollDown : 웹페이지 맨 하단까지 스크롤링 (body 부분을 click)

```python
def click_nolink_for_scrollDown(driver, scrollDown_num=100):
    """Focus the page body with a click, then scroll down with PAGE_DOWN.

    The body is clicked so that it receives keyboard input. If the click
    lands on a link and the browser navigates away, the function goes back
    in history and tries again until the URL is unchanged after the click.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        scrollDown_num: Number of PAGE_DOWN key presses to send.

    Returns:
        None.

    Note:
        The lookup is retried indefinitely (with a page refresh and a one
        second pause) while the ``body`` element cannot be located.
    """
    url = driver.current_url
    while True:
        try:
            # "css selector" is the value of By.CSS_SELECTOR. The generic
            # find_element(by, value) form works on Selenium 3 and 4, whereas
            # find_element_by_css_selector was removed in Selenium 4.3.
            body = driver.find_element("css selector", "body")
        except Exception:
            # Page not ready yet: reload and retry instead of falling through
            # with an unbound (or stale) `body`.
            driver.refresh()
            time.sleep(1)
            continue
        body.click()
        time.sleep(0.1)
        if url == driver.current_url:
            break
        # The click followed a link; return to the original page and retry.
        driver.execute_script("window.history.go(-1)")
    time.sleep(0.1)
    for _ in range(scrollDown_num):
        time.sleep(0.1)
        body.send_keys(Keys.PAGE_DOWN)

click_nolink_for_scrollDown(driver)
```

```python
# "xpath" is the value of By.XPATH; find_element_by_xpath was removed in
# Selenium 4.3, while find_element(by, value) works on Selenium 3 and 4.
ul = driver.find_element("xpath", '//*[@id="site-content"]/div[2]/div/div[2]/div[1]/div/div/div[4]/div[2]/ul')
print(ul.text)
```

## 크롤링 데이터 편집 1 : 커널


- 클래스 Kernel 의 객체는 dataframe으로 이루어짐

  ``` python
  # Kernel.__init__() 테스트
  df = pd.DataFrame(columns=['rank', 'score', 'name', 'author', 'link', 'download'])
  df
  # Kernel.append() 테스트
  df1 = df.append(pd.Series([1,2,3,4,5,6],index=df.columns),ignore_index=True)
  df1

  ```
  
||rank|	score|	name|	author|	link|	download|
|-|-:|-:|-:|-:|-:|-:|
|0|	1|	2|	3|	4|	5|	6|

- 클래스 Code 구현예정

  ```python
  class Code:
      """Placeholder record with two slots, ``model`` and ``module``, both initially None."""

      def __init__(self):
          self.model, self.module = None, None
  ```

- 사용

  ```python
  import pandas as pd

  class Kernel:
      """Collects crawled kernel rows in the DataFrame ``self.df``."""

      def __init__(self):
          # Column order also defines the positional order used by append().
          self.columns = ['rank', 'score', 'author', 'name', 'link', 'download']
          self.df = pd.DataFrame(columns=self.columns)

      def append(self, rank, score, author, name, kernel_link, downlink):
          """Add one row to ``self.df``; values are given in column order.

          ``DataFrame.append`` was removed in pandas 2.0, so the row is
          written at the next integer label instead. Rows are only ever added
          through this method, so the index stays 0..n-1, which matches the
          old ``ignore_index=True`` behaviour.
          """
          self.df.loc[len(self.df)] = [rank, score, author, name, kernel_link, downlink]

  # "css selector" is the value of By.CSS_SELECTOR; the find_elements_by_*
  # helpers were removed in Selenium 4.3, find_elements(by, value) works on
  # Selenium 3 and 4.
  li = ul.find_elements("css selector", "li")
  kernel = Kernel()
  for i, item in enumerate(li):
    score_cells = item.find_elements("css selector", "a > div > span:nth-child(3) > span > span:nth-child(1)")
    if not score_cells:
      # Entry without a score element: skip it instead of raising IndexError.
      continue
    # The score is read from the last five characters of the cell text.
    score = float(score_cells[0].text[-5:])
    if score <= 0.75:
      # Stop at the first entry at or below the threshold
      # (presumably the list is sorted by score, descending -- confirm).
      break
    kernel_name = item.find_elements("css selector", "a > div > div")[0].text
    href = item.find_elements("css selector", "a")
    # First link: author profile, second: kernel page, last: download link.
    author = href[0].get_attribute('href').split('/')[-1]
    kernel_link = href[1].get_attribute('href')
    downlink = href[-1].get_attribute('href')
    kernel.append(i, score, author, kernel_name, kernel_link, downlink)

  kernel.df
  ```

## ipynb파일 다운링크로 받고 md파일로 변환

- kaggle 노트북에서 유용



### 파일 다운, md파일 변환
- 파일 다운링크, 파일 이름 필요
```python
import numpy as np
# idx = 0
# kernel.df.columns
# kernel.df['name'][idx]
# link = kernel.df['download'][idx]
link = '다운링크를 적어주세요'
name =  '파일이름' # 예시 : link.replace(' ','_') + '.ipynb'
!wget {link} -O {name}
!ipython nbconvert --to markdown {name}
!ls
```

### md파일 읽기
- 파일이름 필요

```python
# Scan the converted markdown file for import lines and record which deep
# learning framework(s) the notebook uses in test_df['framework'] at row idx.
# `name` (the .ipynb file name) and `test_df` must already exist.
idx = 0
Tensorflow = False
Pytorch = False
with open(name.replace('ipynb', 'md'), encoding='utf-8') as f:
    for line in f:
        if 'import' not in line:
            continue
        print(line.strip('\n'))
        if 'tensorflow' in line:
            Tensorflow = True
        if 'torch' in line:
            Pytorch = True

if Tensorflow and Pytorch:
    framework = 'Tensorflow/Pytorch'
elif Tensorflow:
    framework = 'Tensorflow'
elif Pytorch:
    framework = 'Pytorch'
else:
    # Neither framework found: leave the existing value untouched.
    framework = None

if framework is not None:
    # Single .loc assignment instead of chained test_df['framework'][idx] = ...,
    # which may write to a temporary copy and is a no-op under copy-on-write.
    test_df.loc[idx, 'framework'] = framework
```


# BeautifulSoup 으로 크롤링

## 안 쓴 방법 



soup = BeautifulSoup(markup=response.content, features='html.parser', parse_only=only_a_tags)


```python
for i, d in enumerate(div_categories):
  text = d.text.strip('\n').split()
  if text:
    print(i, int(text[-2]))
  else:
    count = 0
    for c in div.find_all('div', class_= "card-deck card-break infinite-item")[i]:
      try:
        if c.text:
          count += 1
      except:
        pass
    print(i, count)
```

```
0 57
1 46
2 17
3 14
4 9
5 7
6 2
```

## Methods

- 링크 : https://paperswithcode.com/methods/
- 

In [45]:
import requests
from bs4 import BeautifulSoup

url = "https://paperswithcode.com/methods/"
 
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
div = soup.find('div', class_= "infinite-container featured-methods")
div_group = div.find_all('div', class_= "row task-group-title") # "infinite-container featured-methods"
# div_categories = div.find_all('div', class_= "methods-all-tasks")

for i, d, in enumerate(div_group):
  print(i, d.text.strip('\n'))
  try:
    for a in d.select('a'):
        print(a.get('href'))
  except:
    print(dir(d))

In [38]:
for i, d, in enumerate(div_group):
  print(i, d.text.strip('\n'))

0 General
1 Computer Vision
2 Natural Language Processing
3 Reinforcement Learning
4 Audio
5 Sequential
6 Graphs


In [133]:
groups_li = []
for i, d, in enumerate(div_group):
    for a in d.select('a'):
        print(a.get('href'))
        groups_li.append(url[:-9] + a.get('href'))
print(groups_li)

/methods/area/general
/methods/area/computer-vision
/methods/area/natural-language-processing
/methods/area/reinforcement-learning
/methods/area/audio
/methods/area/sequential
/methods/area/graphs
['https://paperswithcode.com/methods/area/general', 'https://paperswithcode.com/methods/area/computer-vision', 'https://paperswithcode.com/methods/area/natural-language-processing', 'https://paperswithcode.com/methods/area/reinforcement-learning', 'https://paperswithcode.com/methods/area/audio', 'https://paperswithcode.com/methods/area/sequential', 'https://paperswithcode.com/methods/area/graphs']


In [140]:
url2 = groups_li[0]

response2 = requests.get(url2)
soup2 = BeautifulSoup(response2.content, 'html.parser')
# div = soup2.find('div', class_= "infinite-container featured-methods")
div_group2 = soup2.find_all('div', class_= "row task-group-title")

In [141]:
for i, d, in enumerate(div_group2):
  print(i, d.text.strip('\n'))

0 Optimization
1 Skip Connection Blocks
2 Stochastic Optimization
3 Attention
4 Activation Functions
5 Regularization
6 Normalization
7 Attention Mechanisms
8 Loss Functions
9 Attention Modules
10 Self-Supervised Learning
11 Neural Architecture Search
12 Learning Rate Schedules
13 Feedforward Networks
14 Adversarial Training
15 Miscellaneous Components
16 Interpretability
17 Output Functions
18 Semi-Supervised Learning Methods
19 Discriminators
20 Working Memory Models
21 Prioritized Sampling
22 Approximate Inference
23 Affinity Functions
24 Clustering
25 Hyperparameter Search
26 AutoML
27 Non-Parametric Regression
28 Non-Parametric Classification
29 Dimensionality Reduction
30 Initialization
31 Markov Chain Monte Carlo
32 Latent Variable Sampling
33 Skip Connections
34 Rule-based systems
35 Structured Prediction
36 Representation Learning
37 Large Batch Optimization
38 Generalized Additive Models
39 Fine-Tuning
40 Generalized Linear Models
41 Distributions
42 Meta-Learning Algorithms


In [170]:
groups_li = []
count=0
for i, d, in enumerate(div_group2):
    category = d.text.strip('\n')
    print(category)
    print(d.select('a')[0].get('href'))
#     for a in d.select('a'):
#         count+=1
#         print(count)
#         print(a.get('href'))
#         groups_li.append(url[:-9] + a.get('href'))
# print(groups_li)

Optimization
/methods/category/optimization
Skip Connection Blocks
/methods/category/skip-connection-blocks
Stochastic Optimization
/methods/category/stochastic-optimization
Attention
/methods/category/attention-mechanisms
Activation Functions
/methods/category/activation-functions
Regularization
/methods/category/regularization
Normalization
/methods/category/normalization
Attention Mechanisms
/methods/category/attention-mechanisms-1
Loss Functions
/methods/category/loss-functions
Attention Modules
/methods/category/attention-modules
Self-Supervised Learning
/methods/category/self-supervised-learning
Neural Architecture Search
/methods/category/neural-architecture-search
Learning Rate Schedules
/methods/category/learning-rate-schedules
Feedforward Networks
/methods/category/feedforward-networks
Adversarial Training
/methods/category/adversarial-training
Miscellaneous Components
/methods/category/miscellaneous-components
Interpretability
/methods/category/interpretability
Output Functi

In [142]:
groups_li = []
for i, d, in enumerate(div_group2):
    for a in d.select('a'):
        print(a.get('href'))
        groups_li.append(url[:-9] + a.get('href'))
print(groups_li)

/methods/category/optimization
/methods/category/skip-connection-blocks
/methods/category/stochastic-optimization
/methods/category/attention-mechanisms
/methods/category/activation-functions
/methods/category/regularization
/methods/category/normalization
/methods/category/attention-mechanisms-1
/methods/category/loss-functions
/methods/category/attention-modules
/methods/category/self-supervised-learning
/methods/category/neural-architecture-search
/methods/category/learning-rate-schedules
/methods/category/feedforward-networks
/methods/category/adversarial-training
/methods/category/miscellaneous-components
/methods/category/interpretability
/methods/category/output-functions
/methods/category/semi-supervised-learning-methods
/methods/category/discriminators
/methods/category/working-memory-models
/methods/category/prioritized-sampling
/methods/category/approximate-inference
/methods/category/affinity-functions
/methods/category/clustering
/methods/category/hyperparameter-search
/me

In [144]:
url3 = groups_li[0]

response3 = requests.get(url3)
soup3 = BeautifulSoup(response3.content, 'html.parser')
# div = soup2.find('div', class_= "infinite-container featured-methods")
# div_group3 = soup3.find_all('div', class_= "row task-group-title")

# table id 가 methodsTable 인 테이블 가져오기
table = soup3.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="methodsTable")
rows = table.findAll(lambda tag: tag.name=='div')
 
model_names = [r.text.strip() for r in rows]
print(model_names)
len(model_names)

['Adam', 'SGD', 'ADMM', 'RMSProp', 'SGD with Momentum', 'Random Search', 'AdaGrad', 'Gravity', 'SAGA', 'LAMB', 'TTUR', 'Adafactor', 'FA', 'Gradient Clipping', 'AMSGrad', 'Nesterov Accelerated Gradient', 'DFA', 'AdamW', 'LARS', 'Natural Gradient Descent', 'KP', 'Population Based Training', 'Stochastic Gradient Variational Bayes', 'Stochastic Weight Averaging', 'MAS', 'AdaBound', 'DAC', 'RAdam', 'Apollo', 'AdaDelta', 'OHEM', 'NT-ASGD', 'ATSS', 'PISA', 'Lookahead', 'Demon', 'NADAM', 'SLR', 'AdaMax', 'IoU-Balanced Sampling', 'Polyak Averaging', 'QHM', 'AdaShift', 'Tree-structured Parzen Estimator Approach (TPE)', 'QHAdam', 'SGDW', 'AggMo', 'Demon CM', 'YellowFin', 'Demon ADAM', 'AMSBound', 'AdaMod', 'AdaSqrt', 'AdaHessian', 'MPSO', 'Distributed Shampoo', 'Harris Hawks optimization (HHO)', 'Grammatical evolution + Q-learning', 'Differentiable Hyperparameter Search', 'SM3', 'MADGRAD', 'HFPSO']


62

In [162]:
bool(None)

False

In [165]:
odd = True
print(odd)
odd = ~odd
print(odd)
odd = ~odd
print(odd)
odd = ~odd
print(odd)
odd = ~odd
print(odd)

True
-2
1
-2
1


In [161]:
~False, ~~~True

(-1, -2)

In [153]:
test_rows = table.findAll(lambda tag: tag.name=='td')
count = 0
for r in test_rows:
  e = r.text.strip()
  try:
    print(int(e))
  except:
    if e:
      print(e)
      count += 1

1 Adam
2014
3953
2 SGD
1951
923
3 ADMM
2000
232
4 RMSProp
2013
210
5 SGD with Momentum
1999
121
6 Random Search
2000
116
7 AdaGrad
2011
78
8 Gravity
2021
67
9 SAGA
2014
60
10 LAMB
2019
58
11 TTUR
2017
43
12 Adafactor
2018
40
13 FA
2014
39
14 Gradient Clipping
2000
34
15 AMSGrad
2019
25
16 Nesterov Accelerated Gradient
1983
20
17 DFA
2016
19
18 AdamW
2017
17
19 LARS
2017
16
20 Natural Gradient Descent
1998
15
21 KP
2019
15
22 Population Based Training
2017
12
23 Stochastic Gradient Variational Bayes
2013
12
24 Stochastic Weight Averaging
2018
10
25 MAS
2020
9
26 AdaBound
2019
8
27 DAC
2020
8
28 RAdam
2019
7
29 Apollo
2020
7
30 AdaDelta
2012
6
31 OHEM
2016
6
32 NT-ASGD
2017
6
33 ATSS
2019
5
34 PISA
2019
5
35 Lookahead
2019
4
36 Demon
2019
3
37 NADAM
2015
3
38 SLR
2020
3
39 AdaMax
2014
2
40 IoU-Balanced Sampling
2019
2
41 Polyak Averaging
1991
2
42 QHM
2018
2
43 AdaShift
2018
2
44 Tree-structured Parzen Estimator Approach (TPE)
2013
2
45 QHAdam
2018
1
46 SGDW
2017
1
47 AggMo
2018
1
48 Dem

In [None]:
import pandas as pd
class ML_Methods:
    """Collects crawled method rows in the DataFrame ``self.df``."""

    def __init__(self):
        # Column order also defines the positional order used by append().
        self.columns = ['area', 'category', 'method', 'year', 'papers']
        self.df = pd.DataFrame(columns=self.columns)

    def append(self, area, category, method, year, papers):
        """Add one row to ``self.df``; values are given in column order.

        ``DataFrame.append`` was removed in pandas 2.0, so the row is written
        at the next integer label instead. Rows are only ever added through
        this method, so the index stays 0..n-1, which matches the old
        ``ignore_index=True`` behaviour.
        """
        self.df.loc[len(self.df)] = [area, category, method, year, papers]

In [None]:
# The class defined in the previous cell is ML_Methods (not Methods).
ml_methods = ML_Methods()
# NOTE(review): the loop below is still the Kernel-scraping loop from the
# Selenium section; it fills `kernel`, not `ml_methods`, and needs `li` and
# `kernel` from that section.
for i, l in enumerate(li):
  # "css selector" is the value of By.CSS_SELECTOR; the find_elements_by_*
  # helpers were removed in Selenium 4.3.
  score = float(l.find_elements("css selector", "a > div > span:nth-child(3) > span > span:nth-child(1)")[0].text[-5:])
  if score > 0.75:
    kernel_name = l.find_elements("css selector", "a > div > div")[0].text
    href = l.find_elements("css selector", "a")
    author = href[0].get_attribute('href').split('/')[-1]
    kernel_link = href[1].get_attribute('href')
    downlink = href[-1].get_attribute('href')
    kernel.append(i, score, author, kernel_name, kernel_link, downlink)
  else:
    break

kernel.df

In [72]:
~0

-1

In [19]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

url = "https://paperswithcode.com/methods/"


def get_mathod(url3):
    """Scrape the methods table of one category page.

    Downloads ``url3``, finds the table with id ``methodsTable`` and reads
    its ``<td>`` cells in document order. A non-numeric, non-empty cell is
    taken as a method name; the next two integer cells are taken as that
    method's year and paper count.

    Args:
        url3: URL of a category page.

    Returns:
        DataFrame with columns ``method``, ``year``, ``papers``, one row per
        complete (name, year, papers) triple. Empty if the page has no
        ``methodsTable``.
    """
    columns = ['method', 'year', 'papers']

    response3 = requests.get(url3)
    soup3 = BeautifulSoup(response3.content, 'html.parser')

    # Fetch the table whose id is "methodsTable".
    table = soup3.find('table', id='methodsTable')
    if table is None:
        return pd.DataFrame(columns=columns)

    records = []
    method, year = None, None
    expect_year = True
    for cell in table.find_all('td'):
        text = cell.text.strip()
        try:
            value = int(text)
        except ValueError:
            # Only the int() parse is guarded. The original bare `except`
            # also swallowed errors raised while storing the row (e.g. the
            # removed DataFrame.append), silently yielding an empty result.
            if text:
                # A name cell starts a new record, so a row with a missing
                # number cannot shift the columns of the rows after it.
                method = text
                expect_year = True
            continue
        if expect_year:
            year = value
        else:
            records.append([method, year, value])
        expect_year = not expect_year

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)


def get_category(url2):
    """Scrape every category listed on one area page.

    Each ``div`` with class ``row task-group-title`` on ``url2`` is treated
    as a category: its first link is followed with ``get_mathod`` and the
    resulting rows are tagged with the category title.

    Args:
        url2: URL of an area page.

    Returns:
        DataFrame with columns ``method``, ``year``, ``papers``,
        ``category``. The per-category frames are concatenated without
        resetting the index. Empty if no category is found.
    """
    from urllib.parse import urljoin

    response2 = requests.get(url2)
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    div_group2 = soup2.find_all('div', class_="row task-group-title")

    frames = []
    for d in div_group2:
        anchors = d.select('a')
        if not anchors or not anchors[0].get('href'):
            # Title without a link: nothing to follow.
            continue
        category = d.text.strip('\n')
        # Resolve the site-relative href against the page it came from,
        # instead of slicing the global `url` with a hard-coded length.
        url3 = urljoin(url2, anchors[0].get('href'))
        category_df = get_mathod(url3)
        category_df['category'] = category
        frames.append(category_df)

    if not frames:
        # The original raised UnboundLocalError here.
        return pd.DataFrame(columns=['method', 'year', 'papers', 'category'])
    return pd.concat(frames)


def get_area(url):
    """Scrape all areas, categories and methods starting from the index page.

    Each ``div`` with class ``row task-group-title`` on ``url`` is treated as
    an area: its first link is followed with ``get_category`` and the
    resulting rows are tagged with the area title.

    Args:
        url: URL of the methods index page.

    Returns:
        DataFrame with columns ``method``, ``year``, ``papers``,
        ``category``, ``area``. The per-area frames are concatenated without
        resetting the index. Empty if no area is found.
    """
    from urllib.parse import urljoin

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    div_group = soup.find_all('div', class_="row task-group-title")

    frames = []
    for d in div_group:
        anchors = d.select('a')
        if not anchors or not anchors[0].get('href'):
            # Title without a link: nothing to follow.
            continue
        area = d.text.strip('\n')
        # Resolve the site-relative href against `url`; the original
        # `url[:-9] + href` only worked when `url` ended in "/methods/".
        url2 = urljoin(url, anchors[0].get('href'))
        area_df = get_category(url2)
        area_df['area'] = area
        frames.append(area_df)

    if not frames:
        # The original raised UnboundLocalError here.
        return pd.DataFrame(columns=['method', 'year', 'papers', 'category', 'area'])
    return pd.concat(frames)


method_df = get_area(url)
method_df

Unnamed: 0,method,year,papers,category,area
0,Adam,2014,3953,Optimization,General
1,SGD,1951,923,Optimization,General
2,ADMM,2000,232,Optimization,General
3,RMSProp,2013,210,Optimization,General
4,SGD with Momentum,1999,121,Optimization,General
...,...,...,...,...,...
7,RE-NET,2019,2,Graph Models,Graphs
8,Symbolic Deep Learning,2020,1,Graph Models,Graphs
9,HMGNN,2020,1,Graph Models,Graphs
10,CGNN,2019,1,Graph Models,Graphs


In [20]:
df = method_df.copy()

In [27]:
df.describe

<bound method NDFrame.describe of                     method  year papers      category     area
0                     Adam  2014   3953  Optimization  General
1                      SGD  1951    923  Optimization  General
2                     ADMM  2000    232  Optimization  General
3                  RMSProp  2013    210  Optimization  General
4        SGD with Momentum  1999    121  Optimization  General
..                     ...   ...    ...           ...      ...
7                   RE-NET  2019      2  Graph Models   Graphs
8   Symbolic Deep Learning  2020      1  Graph Models   Graphs
9                    HMGNN  2020      1  Graph Models   Graphs
10                    CGNN  2019      1  Graph Models   Graphs
11                  MXMNet  2020      1  Graph Models   Graphs

[1552 rows x 5 columns]>

In [21]:
df1 = df.groupby(['area', 'category']).size()
df1

area        category                   
Audio       Audio Artifact Removal          1
            Audio Model Blocks             11
            Generative Audio Models         9
            Music source separation         1
            Phase Reconstruction            1
                                           ..
Sequential  Recurrent Neural Networks      30
            Sequence To Sequence Models    12
            Sequential Blocks               1
            Temporal Convolutions           7
            Time Series Analysis            5
Length: 152, dtype: int64

In [22]:
df1.loc['Computer Vision', 'Image Models']

98

In [29]:
df2 = df.set_index(['area', 'category'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,method,year,papers
area,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
General,Optimization,Adam,2014,3953
General,Optimization,SGD,1951,923
General,Optimization,ADMM,2000,232
General,Optimization,RMSProp,2013,210
General,Optimization,SGD with Momentum,1999,121
...,...,...,...,...
Graphs,Graph Models,RE-NET,2019,2
Graphs,Graph Models,Symbolic Deep Learning,2020,1
Graphs,Graph Models,HMGNN,2020,1
Graphs,Graph Models,CGNN,2019,1


In [30]:
len(set(df2.index))

152

In [31]:
df2.index.unique(level=0)

Index(['General', 'Computer Vision', 'Natural Language Processing',
       'Reinforcement Learning', 'Audio', 'Sequential', 'Graphs'],
      dtype='object', name='area')

In [32]:
df2.index.unique(level=1)

Index(['Optimization', 'Skip Connection Blocks', 'Stochastic Optimization',
       'Attention', 'Activation Functions', 'Regularization', 'Normalization',
       'Attention Mechanisms', 'Loss Functions', 'Attention Modules',
       ...
       'Speech Synthesis Blocks', 'Recurrent Neural Networks',
       'Sequence To Sequence Models', 'Temporal Convolutions',
       'Time Series Analysis', 'Bidirectional Recurrent Neural Networks',
       'Generative Sequence Models', 'Sequential Blocks', 'Graph Embeddings',
       'Graph Models'],
      dtype='object', name='category', length=152)

In [37]:
dic = {'General':0, 'Computer Vision':0, 'Natural Language Processing':0,'Reinforcement Learning':0, 'Audio':0, 'Sequential':0, 'Graphs':0}

for area, category in list(df2.index.drop_duplicates()):
    dic[area] +=1
print(dic)

{'General': 57, 'Computer Vision': 46, 'Natural Language Processing': 17, 'Reinforcement Learning': 14, 'Audio': 9, 'Sequential': 7, 'Graphs': 2}


In [33]:
image_models_df = df2.loc['Computer Vision', 'Image Models']
image_models_df

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,method,year,papers
area,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Computer Vision,Image Models,ResNet,2015,1074
Computer Vision,Image Models,AlexNet,2012,287
Computer Vision,Image Models,VGG,2014,273
Computer Vision,Image Models,DenseNet,2016,232
Computer Vision,Image Models,MobileNetV2,2018,133
Computer Vision,...,...,...,...
Computer Vision,Image Models,Bottleneck Transformer,2021,1
Computer Vision,Image Models,SANet,2020,1
Computer Vision,Image Models,TNT,2021,1
Computer Vision,Image Models,SKNet,2000,0


In [34]:
len(image_models_df['method'].unique())

98

In [35]:
image_models_df[image_models_df['papers']==0]

Unnamed: 0_level_0,Unnamed: 1_level_0,method,year,papers
area,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Computer Vision,Image Models,SKNet,2000,0
Computer Vision,Image Models,LR-Net,2000,0


In [36]:
image_models_df[image_models_df['method'] == 'VGG']

Unnamed: 0_level_0,Unnamed: 1_level_0,method,year,papers
area,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Computer Vision,Image Models,VGG,2014,273


In [39]:
df.columns

Index(['method', 'year', 'papers', 'category', 'area'], dtype='object')

In [47]:
df_save1 = df[['area', 'category', 'method', 'year', 'papers']].reindex()
df_save1.to_csv('mathods.csv')
df2.to_csv('mathods_multiIdx.csv')

In [48]:
!ls

mathods.csv  mathods_multiIdx.csv  sample_data
