# Initialisation 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Contest ids in the 510..894 range that are skipped when loading.
dataA_exceptions = [525, 540]

# One DataFrame per remaining contest id, read from that contest's CSV file.
dataA0 = []
for i in range(510, 895):
    if i not in dataA_exceptions:
        dataA0.append(pd.read_csv(f"../newyorker_caption_contest_virgin/data/{i}.csv"))

# Shallow copy: a new list that still refers to the same DataFrame objects.
dataA1 = list(dataA0)
dataA1[0].head()

Unnamed: 0,rank,caption,mean,precision,votes,not_funny,somewhat_funny,funny
0,0,I'm a congressman--obstruction is my job.,1.913043,0.094022,69,24,27,18
1,1,"I'm what they mean when they say, 'The middle ...",1.842105,0.191381,19,8,6,5
2,2,Does this suit make me look flat?,1.711111,0.112915,45,21,16,8
3,3,"When the right woman comes along, I'll know it.",1.625,0.116657,32,15,14,3
4,4,"I used to lie in the gutter, but then I quit d...",1.617647,0.13361,34,19,9,6


In [3]:
# Read the contest listing and work on an independent (deep) copy of it.
dataC0 = pd.read_json('../newyorker_caption_contest_virgin/contests.json')
dataC1 = dataC0.copy(deep=True)
dataC1.head(5)

Unnamed: 0,contest_id,image,data,metadata
0,510,images/510.jpg,data/510.csv,"{'num_captions': 3905, 'num_votes': 41185, 'im..."
1,511,images/511.jpg,data/511.csv,"{'num_captions': 3325, 'num_votes': 28205, 'im..."
2,512,images/512.jpg,data/512.csv,"{'num_captions': 4399, 'num_votes': 21574, 'im..."
3,513,images/513.jpg,data/513.csv,"{'num_captions': 4141, 'num_votes': 16894, 'im..."
4,514,images/514.jpg,data/514.csv,"{'num_captions': 3951, 'num_votes': 95790, 'im..."


In [4]:

# Expand the 'metadata' entries into one column per key.
dataC_metadata = pd.json_normalize(dataC1['metadata'])

# Swap the nested 'metadata' column for the flattened columns (aligned on the row index).
dataC = pd.concat([dataC1.drop('metadata', axis=1), dataC_metadata], axis='columns')
dataC.head(5)

Unnamed: 0,contest_id,image,data,num_captions,num_votes,image_locations,image_descriptions,image_uncanny_descriptions,entities,questions
0,510,images/510.jpg,data/510.csv,3905.0,41185.0,[the street],[A man is relaxing on a city street. Others ar...,[A man is just laying in the middle of the sid...,[https://en.wikipedia.org/wiki/Bystander_effec...,[Why is he laying there?]
1,511,images/511.jpg,data/511.csv,3325.0,28205.0,"[the front hard, a residential walkway]",[A man in a winter coat and cap is looking at ...,[It's unusual to see someone holding a snow sh...,"[https://en.wikipedia.org/wiki/Snowball_fight,...",[Is the man overly small or the shovel overly ...
2,512,images/512.jpg,data/512.csv,4399.0,21574.0,"[yoga place, a yoga studio]",[A man and woman are standing facing one anoth...,[Nothing is really out of place in this image....,"[https://en.wikipedia.org/wiki/Rug, https://en...","[Why is the man carrying a huge rug?, Why is t..."
3,513,images/513.jpg,data/513.csv,4141.0,16894.0,"[a workplace, an elevator]",[Three business men are walking down a hall. T...,[A suit case is usually carried by one person ...,[https://en.wikipedia.org/wiki/Worker_cooperat...,[Why is the briefcase big enough for three peo...
4,514,images/514.jpg,data/514.csv,3951.0,95790.0,[plains],[Some cowboys are riding through the desert. T...,[There are rocking horses in place of real hor...,"[https://en.wikipedia.org/wiki/Rocking_horse, ...",[Why is this chase taking place?]


# Other

### Remove redundant columns (index and rank)

We could import directly with rank index:
```python
pd.read_csv(f"../newyorker_caption_contest_virgin/data/{i}.csv", index_col=['rank'])
```
but since not all files have a `rank` column, it makes sense to import the files as they are and remove the redundant column later.

```python
data.reset_index(drop=True)
```
**reset_index** 
- re-indexes the rows with a new default integer index
- turns the previous index into a regular column (labelled "index" when the index has no name)

**drop=True** 
- removes the previous index column. 

```python
data.set_index('rank')
```
**set_index('rank')**
- set the column 'rank' as index column
- only works if the column 'rank' really exists




In [5]:
# Give every contest DataFrame a 'rank' index so that all frames share one layout.
# dataA2 keeps the same order and length as dataA1: position i in both lists
# refers to the same contest.
dataA2 = []

for i, data in enumerate(dataA1):

    if 'rank' in data.columns:
        if not (data.index == data['rank']).all():
            # Such a frame used to be dropped here, which silently shifted the
            # position of every later frame. Keep it and report it instead.
            print(f"Warning: 'rank' differs from the row position in dataA1[{i}]")
        # set_index replaces the current index with the 'rank' column, so no
        # reset_index is needed beforehand.
        data = data.set_index('rank')

    else:
        # No 'rank' column: order the rows by the 'mean' column (highest first)
        # and use the resulting row position as the rank.
        data = data.sort_values('mean', ascending=False)
        data = data.reset_index(drop=True)
        data.index.name = 'rank'

    dataA2.append(data)

# Every input frame must yield exactly one output frame.
assert len(dataA2) == len(dataA1), "a DataFrame was lost while re-indexing"

dataA2[2]


Unnamed: 0_level_0,caption,mean,precision,votes,not_funny,somewhat_funny,funny
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"We're pretentious, not ostentatious.",2.250000,0.168005,32,11,2,19
1,Broga meets on the 2nd floor.,2.000000,0.241523,16,7,2,7
2,Please put that back in reception.,1.900000,0.179505,10,2,7,1
3,"I told you to bring your own, that's from our ...",1.857143,0.177192,14,4,8,2
4,"Let her out, Fred.",1.846154,0.249259,13,6,3,4
...,...,...,...,...,...,...,...
4394,Is she in your mat practicing the corpse pose?,1.000000,0.408248,3,3,0,0
4395,beginning yoga is room five.,1.000000,0.408248,3,3,0,0
4396,You're supposed to leave your shoes and your e...,1.000000,0.288675,4,4,0,0
4397,You can stretch the full length?,1.000000,0.223607,5,5,0,0


### Consistency verification 

Since some values are missing from the data, we search for the NaN values and replace them.

```python
data.isnull().values.any()
```
- returns True if any value in data is null


```python
dataA3[i].fillna('CAPTION_NOT_FOUND', inplace = True)
```
- For DataFrame i, fill ALL NaN values with the given text

In [6]:
# Build dataA3 from dataA2 with every NaN replaced by a placeholder string.
# fillna is used without inplace=True: it returns a new DataFrame, so the
# frames held by dataA2 are left unchanged and this cell can be re-run safely.
# Frames without NaN are reused as they are.
# Note: the scalar fillna replaces NaN in every column, not only in 'caption'.
dataA3 = []

for i, data in enumerate(dataA2):
    if data.isnull().values.any():
        # Report the list position of each frame that contained NaN.
        print(True, i)
        data = data.fillna('CAPTION_NOT_FOUND')
    dataA3.append(data)

# Verify that really no NaN is left in ANY frame. The flag must accumulate
# over all frames; assigning it per iteration would only reflect the last one.
error = any(data.isnull().values.any() for data in dataA3)
print("Error is: ", error)



True 149
True 157
True 158
True 159
True 163
True 179
True 180
True 181
True 182
True 183
True 184
True 186
True 190
True 192
True 195
True 197
True 199
True 200
True 201
True 203
True 204
True 209
True 219
True 236
True 240
True 248
True 257
True 259
True 260
True 270
True 281
True 302
True 303
True 306
True 307
True 314
True 316
True 323
True 337
True 351
True 355
True 359
True 362
True 366
True 367
True 379
Error is:  False
