# Setting up


In [1]:
import ast

import pandas as pd

In [2]:
# Load the CSV of computer vision papers with their tagged tasks and methods,
# lower-casing the task strings as part of the load.
## Lower-casing merges tasks that differ only by capitalisation, which improves
## our count of unique tasks (our regex matching ignores case anyway, though).
paper_df = (
    pd.read_csv('data/cv_matched_tasks_methods.csv')
    .assign(tasks=lambda df: df["tasks"].str.lower())
)

In [3]:
# Preview the first rows to check that the columns loaded as expected
paper_df.head()

Unnamed: 0,cset_id,year,country,tasks,methods,text
0,carticle_0036993254,2016,China,{'no-reference quality assessment of enhanced ...,set(),No-reference quality assessment of enhanced im...
1,carticle_0169019068,2017,Australia,set(),{'convolutional neural networks'},Beyond filters: compact feature map for portab...
2,carticle_0082207762,2017,New Zealand,{'dimensionality reduction'},{'Semi-supervised double sparse graphs'},Semi-supervised double sparse graphs based dis...
3,carticle_0063791730,2018,Indonesia,"{'moving object in 3-dimensional space', 'desi...",{'Genetic Algorithm'},Designing an Optimization of Orientation Syste...
4,carticle_0076183469,2016,Taiwan,{'object detection'},{'primitive BG model-based'},Robust techniques for abandoned and removed ob...


In [4]:
# (number of rows, number of columns) of the loaded table
paper_df.shape

(208867, 6)

# Identifying top tasks


In [5]:
def gen_row_tasks_all(paper_df):
    """Parse each row's ``tasks`` cell and summarise the tasks found.

    Every value in ``paper_df['tasks']`` is expected to be the string form of
    a Python collection of task names, e.g. ``"{'object detection'}"`` or
    ``"set()"`` for a paper with no tasks.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        Must contain a ``tasks`` column holding those strings.

    Returns
    -------
    row_tasks_all : list
        The parsed collection of tasks for each row, in row order.
    task_counts : list of int
        Number of tasks in each row (same order as ``row_tasks_all``).
    unique_tasks : list
        Every distinct task seen across all rows; the order is arbitrary.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a string containing a plain Python literal.
    """
    task_counts = []
    row_tasks_all = []
    seen_tasks = set()
    # Iterate over the column itself: paper_df.iloc[row]['tasks'] would build
    # a complete row Series for every paper just to read one field.
    for raw_tasks in paper_df['tasks']:
        if isinstance(raw_tasks, str) and raw_tasks.strip() == 'set()':
            # An empty set has no literal syntax; ast.literal_eval only
            # accepts the "set()" spelling from Python 3.9, so handle it here.
            row_tasks = set()
        else:
            # SECURITY: these strings come from a data file, so they are parsed
            # as literals only. The previous eval() would have executed any
            # expression found in the CSV.
            row_tasks = ast.literal_eval(raw_tasks)
        row_tasks_all.append(row_tasks)
        seen_tasks.update(row_tasks)
        task_counts.append(len(row_tasks))
    unique_tasks = list(seen_tasks)
    return row_tasks_all, task_counts, unique_tasks

In [6]:
# The per-paper task counts get their own name: `task_counts` is reserved for
# the task -> count dict returned by gen_task_counts, so re-running this cell
# can no longer replace that dict with a list.
row_tasks_all, paper_task_counts, unique_tasks = gen_row_tasks_all(paper_df)

In [7]:
country_list = list(paper_df['country']) #country of each paper, in row order: one entry per row of paper_df (duplicates included), so index i lines up with row_tasks_all[i]
n_papers = len(row_tasks_all) #total number of rows (papers) parsed

In [8]:
#generate overall statistics -- what are the most popular tasks?

def gen_task_counts(row_tasks_all, unique_tasks):
    """Count how often each task occurs across all papers.

    Parameters
    ----------
    row_tasks_all : iterable of iterables
        One collection of tasks per paper. When each collection is a set, a
        task is counted at most once per paper.
    unique_tasks : list
        Unused; kept so existing calls keep working.

    Returns
    -------
    dict
        Maps each task that occurs at least once to its number of
        occurrences, in first-seen order.
    """
    task_counts = {}
    for curr_tasks in row_tasks_all:
        for task in curr_tasks:
            # dict.get supplies the starting 0 for a first occurrence, so no
            # catch-all except is needed to initialise a key.
            task_counts[task] = task_counts.get(task, 0) + 1
    return task_counts
    

In [9]:
# Tally how often each task occurs across all papers (task -> count)
task_counts = gen_task_counts(row_tasks_all, unique_tasks)

In [10]:
# Table of tasks indexed by task name: paper count, and the percentage of all
# CV papers in which the task appears.
task_count_df = pd.DataFrame.from_dict(task_counts, orient='index', columns=['count'])
task_count_df['percentage'] = task_count_df['count'] * 100. / n_papers

In [11]:
# Show the ten tasks with the highest paper counts
task_count_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,count,percentage
object detection,7282,3.486429
image classification,7262,3.476854
face recognition,5997,2.871205
denoising,4410,2.111391
image retrieval,3629,1.737469
object recognition,3480,1.666132
pose estimation,2984,1.42866
semantic segmentation,2829,1.35445
action recognition,2727,1.305616
super-resolution,2680,1.283113


# Tasks with the highest difference in US vs Chinese focus
For each task, we compute the percentage of US papers and the percentage of Chinese papers in which it appears, and then find the tasks with the largest difference between the two percentages.

In [12]:
def gen_country_task_counts(row_tasks_all, unique_tasks, country_list):
    """Count, for every country, how many of its papers mention each task.

    Parameters
    ----------
    row_tasks_all : list
        One collection of tasks per paper.
    unique_tasks : list
        The tasks every country should have an entry for.
    country_list : list
        The country of each paper; ``country_list[i]`` belongs to
        ``row_tasks_all[i]``.

    Returns
    -------
    dict
        ``country_task_counts[country][task]`` is the number of that
        country's papers in which ``task`` occurs. To allow inter-country
        comparison, every country has an entry (possibly 0) for every task in
        ``unique_tasks``. A task that is missing from ``unique_tasks`` is
        still counted, but only for the countries whose papers mention it.

    Raises
    ------
    IndexError
        If ``country_list`` is shorter than ``row_tasks_all``.
    """
    # Seed every country with a zero for every known task so that any two
    # countries can be compared key-for-key.
    country_task_counts = {
        country: dict.fromkeys(unique_tasks, 0) for country in set(country_list)
    }

    for i, paper_tasks in enumerate(row_tasks_all):
        per_task = country_task_counts[country_list[i]]
        for task in paper_tasks:
            # .get covers a task absent from unique_tasks. The old fallback
            # wrote to the unrelated global `task_counts` instead of this
            # country's dict, so such a task raised or corrupted that global.
            per_task[task] = per_task.get(task, 0) + 1
    return country_task_counts

In [13]:
# Build the country -> {task: paper count} lookup for every country in country_list
country_task_counts = gen_country_task_counts(row_tasks_all, unique_tasks, country_list)

In [14]:
# Per-task paper counts for the two countries being compared
US_counts = country_task_counts["United States"]
CH_counts = country_task_counts["China"]

# Number of papers from each country (denominators for the percentages computed next)
n_US = country_list.count("United States")
n_CH = country_list.count("China")

In [15]:
# US table indexed by task: paper count, and percentage of all US papers
US_count_df = pd.DataFrame.from_dict(US_counts, orient='index', columns=['count'])
US_count_df['percent_country'] = US_count_df['count'] * 100. / n_US

In [16]:
# China table indexed by task: paper count, and percentage of all Chinese papers
CH_count_df = pd.DataFrame.from_dict(CH_counts, orient='index', columns=['count'])
CH_count_df['percent_country'] = CH_count_df['count'] * 100. / n_CH

In [17]:
# For each task, compare "% of US papers that include the task" with
# "% of CH papers that include the task", keeping the raw counts alongside.
US_CH_pct_diffs = {}
for task in unique_tasks:
    # Fetch each table's row for this task once. A row comes back as a
    # single-dtype Series, so the counts read from it are floats.
    US_row = US_count_df.loc[task]
    CH_row = CH_count_df.loc[task]
    world_row = task_count_df.loc[task]

    US_pct, CH_pct = US_row['percent_country'], CH_row['percent_country']
    US_minus_CH_pct = US_pct - CH_pct
    US_count, CH_count = US_row['count'], CH_row['count']
    world_count, world_pct = world_row['count'], world_row['percentage']

    # Value order must match the column names given when the dataframe is built.
    US_CH_pct_diffs[task] = [
        US_minus_CH_pct,
        US_count,
        CH_count,
        world_count,
        US_pct,
        CH_pct,
        world_pct,
    ]

In [18]:
# Turn the per-task lists into a table: one row per task, one named column per list position
US_CH_diffs_df = pd.DataFrame.from_dict(
    US_CH_pct_diffs,
    orient='index',
    columns=[
        'US-CH percent difference',
        'US count',
        'CH count',
        'world count',
        'US pct',
        'CH pct',
        'percent of world',
    ],
)

In [19]:
# Ten tasks with the highest "US% - CH%", i.e. relatively more common in US papers
US_CH_diffs_df.sort_values('US-CH percent difference', ascending=False).head(10)

Unnamed: 0,US-CH percent difference,US count,CH count,world count,US pct,CH pct,percent of world
object recognition,0.983376,775.0,597.0,3480.0,2.125326,1.14195,1.666132
computed tomography (ct),0.797431,454.0,234.0,1573.0,1.245029,0.447598,0.753111
semantic segmentation,0.58997,672.0,655.0,2829.0,1.842863,1.252893,1.35445
transfer learning,0.566128,581.0,537.0,2487.0,1.593309,1.027181,1.19071
pose estimation,0.538236,635.0,629.0,2984.0,1.741396,1.20316,1.42866
activity recognition,0.440378,268.0,154.0,1315.0,0.734951,0.294573,0.629587
question answering,0.34947,193.0,94.0,466.0,0.529275,0.179805,0.223108
decision making,0.3126,167.0,76.0,781.0,0.457973,0.145374,0.373922
domain adaptation,0.311156,299.0,266.0,1074.0,0.819964,0.508809,0.514203
scene understanding,0.306695,206.0,135.0,801.0,0.564925,0.25823,0.383498


In [20]:
# Ten tasks with the lowest "US% - CH%", i.e. relatively more common in Chinese papers
US_CH_diffs_df.sort_values('US-CH percent difference', ascending=True).head(10)

Unnamed: 0,US-CH percent difference,US count,CH count,world count,US pct,CH pct,percent of world
saliency detection,-1.114955,134.0,775.0,1256.0,0.367476,1.482431,0.60134
super-resolution,-1.029209,408.0,1123.0,2680.0,1.118881,2.14809,1.283113
image classification,-0.997594,1267.0,2338.0,7262.0,3.474565,4.472159,3.476854
visual tracking,-0.964223,235.0,841.0,1602.0,0.644454,1.608677,0.766995
denoising,-0.828967,654.0,1371.0,4410.0,1.793501,2.622468,2.111391
face recognition,-0.797951,892.0,1696.0,5997.0,2.446181,3.244132,2.871205
image super-resolution,-0.769477,170.0,646.0,1229.0,0.4662,1.235678,0.588413
object tracking,-0.737856,358.0,899.0,2597.0,0.981763,1.71962,1.243375
image denoising,-0.698342,235.0,702.0,1904.0,0.644454,1.342795,0.911585
person re-identification,-0.653147,218.0,654.0,1382.0,0.597834,1.25098,0.661665
