In [100]:
import pandas as pd
import numpy as np

import simplejson as json
import pickle

In [101]:
# Store a variable (after heavy computations) in a file
def save_pickle(result, file_path = 'pickle'):
    with open(file_path, 'wb') as file:
        pickle.dump(result, file)

In [102]:
# Load the variable using the file
def load_pickle(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

<h1> Food Production : 2050
<h2> Data Collection/Cleaning</h2>

In this notebook we will take care of the data loading/cleaning and processing.
Indeed we will load the csv file and extract the interesting features, correct them and remove any missing values.
We will then process the dataframe into an algorithm able to process/compress the data efficiently by subsampling the data without any loss of information/precision.
We do not want to subsample randomly/orderly because it will lead to mean some points with a position really different (east side of USA and west side of Europe will give a meaning point position in the sea, which is not relevant).

We will be able to do the data collection/cleaning of the different models (by changing the file path and the features variables) :
- SSP1 --> 4 submodels
- SSP2 --> 4 submodels
- SSP3 --> 4 submodels
- SSP4 --> 4 submodels
- SSP5 --> 4 submodels

And for each model we'll be able to extract different features (calories, production, population etc...)

In [2]:
# Load the dataset
data_folder = 'Prelim_results_for_students/2050/'
ssp1cc = 'SSP1_cc.csv'
data = pd.read_csv(data_folder+'SSP1_cc.csv')



In [3]:
data.head()

Unnamed: 0,pixel_id,log(cal_per_ha),calories_per_ha,Calories,%cropland,Change_in_Yield,Change_in_Production,log(cal_per_ha) 2000,calories_per_ha 2000,Calories 2000,%cropland 2000,temp_avg,precip,workability_index,slope,altitude,lat,lon,population,ha_per_cell_5m
0,776595,14.691584,2401451.0,27677841.0,0.0,2401450.0,27677830.0,0.0,1.0,12.0,0.0,-111,288,6.0,-0.14773,106.0,-75.0,96.25,2147484000.0,2215.1946
1,780822,14.762812,2578741.0,5976685.0,0.0,2578740.0,5976683.0,0.0,1.0,2.0,0.0,-100,272,6.0,-0.08431,77.0,-74.92,88.5,2147484000.0,2227.2866
2,780823,14.706909,2438538.0,62169146.0,0.0,2438537.0,62169120.0,0.0,1.0,25.0,0.0,-101,273,6.0,-0.09272,90.0,-74.92,88.583333,2147484000.0,2227.2866
3,780827,14.899178,2955500.0,20549676.0,0.0,2955499.0,20549670.0,0.0,1.0,7.0,0.0,-102,272,6.0,-0.87669,91.0,-74.92,88.916667,2147484000.0,2227.2866
4,780915,14.761227,2574656.0,41770535.0,0.0,2574655.0,41770520.0,0.0,1.0,16.0,0.0,-110,286,6.0,-0.36656,88.0,-74.92,96.25,2147484000.0,2227.2866


In [4]:
calories = data[['Calories', 'lat','lon']]

In [5]:
calories.head()

Unnamed: 0,Calories,lat,lon
0,27677841.0,-75.0,96.25
1,5976685.0,-74.92,88.5
2,62169146.0,-74.92,88.583333
3,20549676.0,-74.92,88.916667
4,41770535.0,-74.92,96.25


In [6]:
calories = calories.dropna()

In [7]:
# Reduce Calories values
cal_series = calories['Calories'].apply(lambda x:int(np.log(x)*1000))

In [8]:
calories['Calories'] = cal_series

In [9]:
calories.describe()

Unnamed: 0,Calories,lat,lon
count,1064557.0,1064557.0,1064557.0
mean,21359.14,-23.75191,26.28324
std,2305.517,28.40238,74.75068
min,8637.0,-75.0,-160.9167
25%,19724.0,-47.42,-43.66667
50%,21783.0,-31.08,31.83333
75%,23121.0,-1.25,86.16667
max,27460.0,55.25,179.8333


In [10]:
calories = calories.round(3)

In [115]:
# Correct organization of data
new_calories = pd.DataFrame(columns=['Calories','lon', 'lat'])

new_calories['Calories'] = calories['Calories']
new_calories['lon'] = calories['lon']
# Minus sign (because inverted in the original dataset)
new_calories['lat'] = -calories['lat']

In [153]:
new_calories.head()

Unnamed: 0,Calories,lon,lat
0,17136,96.25,75.0
1,15603,88.5,74.92
2,17945,88.583,74.92
3,16838,88.917,74.92
4,17547,96.25,74.92


In [272]:
# example to get all indices corresponding to the latitude value
# used in next steps
testind = (np.where(new_calories['lat']==73.42))[0]
testind

In [162]:
test_df = new_calories.iloc[testind].head()
test_df

In [290]:
# All different values of lat
vec_lat = new_calories['lat'].unique()

In [291]:
# Init the Dataframe 
reduced_calories = pd.DataFrame(columns=['Calories','lon', 'lat'])

In [279]:
# This code will compress the data by taking the mean of 5 (sample) points having the same latitude 
# and a close longitude distance = 1 (distance)

sample = 5
distance = 1
iterat = 0

# Iterate through all the different 'lat' values
for lat in vec_lat:
    lat_ind = np.where(new_calories['lat']==lat)[0]
    temp_df_lat = new_calories.iloc[lat_ind]
    
    # index will iterate from 0 to 'sample' to create a mean of the values
    index = 0
    
    # For each group in samples using mean of near distance
    for i in range(temp_df_lat.shape[0]):
        
        temp_cal = temp_df_lat.iloc[i]['Calories']
        temp_lon = temp_df_lat.iloc[i]['lon']
        temp_lat = temp_df_lat.iloc[i]['lat']
        
        # First value of the subsample
        if(index == 0):
            # the 'prev_X' values will be tested with the next point to check the longitude distance
            prev_cal = temp_cal
            prev_lon = temp_lon
            prev_lat = temp_lat
            mean_df = pd.DataFrame([[temp_cal, temp_lon,temp_lat]], columns = ['Calories','lon', 'lat'])
            index =index +1
        
        else:
            # Test if the longitude distance between the points is less than 'distance'
            if( (max(temp_lon,prev_lon) - min(temp_lon,prev_lon) <= distance) ):  
                if(index < sample):
                    temp_df = pd.DataFrame([[temp_cal, temp_lon,temp_lat]], columns = ['Calories','lon', 'lat'])
                    mean_df = mean_df.append(temp_df)
                    index = index + 1
                    
                elif(index == sample):
                    # Add the last value of the subsample
                    temp_df = pd.DataFrame([[temp_cal, temp_lon,temp_lat]], columns = ['Calories','lon', 'lat'])
                    mean_df = mean_df.append(temp_df)
                    
                    # Compute the mean for the 5 ('sample') subsamples 
                    temp_mean = mean_df.mean()
                    mean_cal = temp_mean[0]
                    mean_long = temp_mean[1]
                    mean_lat = temp_mean[2]
                    
                    # Append the values to the dataframe
                    temp_df2 = pd.DataFrame([[mean_cal, mean_long,mean_lat]], columns = ['Calories','lon', 'lat'])
                    reduced_calories = reduced_calories.append(temp_df2, ignore_index=True)
                    index = 0
                
            # The point is too far from the previous point : create a new subsample and store the previous one
            else:
                # Compute the mean for the 5 ('sample') subsamples 
                temp_mean = mean_df.mean()
                mean_cal = temp_mean[0]
                mean_long = temp_mean[1]
                mean_lat = temp_mean[2]
                
                # Append the values to the dataframe
                temp_df2 = pd.DataFrame([[mean_cal, mean_long,mean_lat]], columns = ['Calories','lon', 'lat'])
                reduced_calories = reduced_calories.append(temp_df2, ignore_index=True)

                # Start new subsample
                prev_cal = temp_cal
                prev_lat = temp_lat
                prev_lon = temp_lon
                mean_df = pd.DataFrame([[temp_cal, temp_lon,temp_lat]], columns = ['Calories','lon', 'lat'])
                index = 1
                
        if(i == temp_df_lat.shape[0]-1):
            # If final point of the dataframe, store it
            temp_mean = mean_df.mean()
            mean_cal = temp_mean[0]
            mean_long = temp_mean[1]
            mean_lat = temp_mean[2]
            
            temp_df2 = pd.DataFrame([[mean_cal, mean_long,mean_lat]], columns = ['Calories','lon', 'lat'])
            reduced_calories = reduced_calories.append(temp_df2, ignore_index=True)
                
    # Observe the progression
    iterat = iterat +1
    print("Iteration %d / %d" % (iterat, len(vec_lat)))
            

Iteration 1 / 1564
Iteration 2 / 1564
Iteration 3 / 1564
Iteration 4 / 1564
Iteration 5 / 1564
Iteration 6 / 1564
Iteration 7 / 1564
Iteration 8 / 1564
Iteration 9 / 1564
Iteration 10 / 1564
Iteration 11 / 1564
Iteration 12 / 1564
Iteration 13 / 1564
Iteration 14 / 1564
Iteration 15 / 1564
Iteration 16 / 1564
Iteration 17 / 1564
Iteration 18 / 1564
Iteration 19 / 1564
Iteration 20 / 1564
Iteration 21 / 1564
Iteration 22 / 1564
Iteration 23 / 1564
Iteration 24 / 1564
Iteration 25 / 1564
Iteration 26 / 1564
Iteration 27 / 1564
Iteration 28 / 1564
Iteration 29 / 1564
Iteration 30 / 1564
Iteration 31 / 1564
Iteration 32 / 1564
Iteration 33 / 1564
Iteration 34 / 1564
Iteration 35 / 1564
Iteration 36 / 1564
Iteration 37 / 1564
Iteration 38 / 1564
Iteration 39 / 1564
Iteration 40 / 1564
Iteration 41 / 1564
Iteration 42 / 1564
Iteration 43 / 1564
Iteration 44 / 1564
Iteration 45 / 1564
Iteration 46 / 1564
Iteration 47 / 1564
Iteration 48 / 1564
Iteration 49 / 1564
Iteration 50 / 1564
Iteration

Iteration 397 / 1564
Iteration 398 / 1564
Iteration 399 / 1564
Iteration 400 / 1564
Iteration 401 / 1564
Iteration 402 / 1564
Iteration 403 / 1564
Iteration 404 / 1564
Iteration 405 / 1564
Iteration 406 / 1564
Iteration 407 / 1564
Iteration 408 / 1564
Iteration 409 / 1564
Iteration 410 / 1564
Iteration 411 / 1564
Iteration 412 / 1564
Iteration 413 / 1564
Iteration 414 / 1564
Iteration 415 / 1564
Iteration 416 / 1564
Iteration 417 / 1564
Iteration 418 / 1564
Iteration 419 / 1564
Iteration 420 / 1564
Iteration 421 / 1564
Iteration 422 / 1564
Iteration 423 / 1564
Iteration 424 / 1564
Iteration 425 / 1564
Iteration 426 / 1564
Iteration 427 / 1564
Iteration 428 / 1564
Iteration 429 / 1564
Iteration 430 / 1564
Iteration 431 / 1564
Iteration 432 / 1564
Iteration 433 / 1564
Iteration 434 / 1564
Iteration 435 / 1564
Iteration 436 / 1564
Iteration 437 / 1564
Iteration 438 / 1564
Iteration 439 / 1564
Iteration 440 / 1564
Iteration 441 / 1564
Iteration 442 / 1564
Iteration 443 / 1564
Iteration 444

Iteration 788 / 1564
Iteration 789 / 1564
Iteration 790 / 1564
Iteration 791 / 1564
Iteration 792 / 1564
Iteration 793 / 1564
Iteration 794 / 1564
Iteration 795 / 1564
Iteration 796 / 1564
Iteration 797 / 1564
Iteration 798 / 1564
Iteration 799 / 1564
Iteration 800 / 1564
Iteration 801 / 1564
Iteration 802 / 1564
Iteration 803 / 1564
Iteration 804 / 1564
Iteration 805 / 1564
Iteration 806 / 1564
Iteration 807 / 1564
Iteration 808 / 1564
Iteration 809 / 1564
Iteration 810 / 1564
Iteration 811 / 1564
Iteration 812 / 1564
Iteration 813 / 1564
Iteration 814 / 1564
Iteration 815 / 1564
Iteration 816 / 1564
Iteration 817 / 1564
Iteration 818 / 1564
Iteration 819 / 1564
Iteration 820 / 1564
Iteration 821 / 1564
Iteration 822 / 1564
Iteration 823 / 1564
Iteration 824 / 1564
Iteration 825 / 1564
Iteration 826 / 1564
Iteration 827 / 1564
Iteration 828 / 1564
Iteration 829 / 1564
Iteration 830 / 1564
Iteration 831 / 1564
Iteration 832 / 1564
Iteration 833 / 1564
Iteration 834 / 1564
Iteration 835

Iteration 1171 / 1564
Iteration 1172 / 1564
Iteration 1173 / 1564
Iteration 1174 / 1564
Iteration 1175 / 1564
Iteration 1176 / 1564
Iteration 1177 / 1564
Iteration 1178 / 1564
Iteration 1179 / 1564
Iteration 1180 / 1564
Iteration 1181 / 1564
Iteration 1182 / 1564
Iteration 1183 / 1564
Iteration 1184 / 1564
Iteration 1185 / 1564
Iteration 1186 / 1564
Iteration 1187 / 1564
Iteration 1188 / 1564
Iteration 1189 / 1564
Iteration 1190 / 1564
Iteration 1191 / 1564
Iteration 1192 / 1564
Iteration 1193 / 1564
Iteration 1194 / 1564
Iteration 1195 / 1564
Iteration 1196 / 1564
Iteration 1197 / 1564
Iteration 1198 / 1564
Iteration 1199 / 1564
Iteration 1200 / 1564
Iteration 1201 / 1564
Iteration 1202 / 1564
Iteration 1203 / 1564
Iteration 1204 / 1564
Iteration 1205 / 1564
Iteration 1206 / 1564
Iteration 1207 / 1564
Iteration 1208 / 1564
Iteration 1209 / 1564
Iteration 1210 / 1564
Iteration 1211 / 1564
Iteration 1212 / 1564
Iteration 1213 / 1564
Iteration 1214 / 1564
Iteration 1215 / 1564
Iteration 

Iteration 1545 / 1564
Iteration 1546 / 1564
Iteration 1547 / 1564
Iteration 1548 / 1564
Iteration 1549 / 1564
Iteration 1550 / 1564
Iteration 1551 / 1564
Iteration 1552 / 1564
Iteration 1553 / 1564
Iteration 1554 / 1564
Iteration 1555 / 1564
Iteration 1556 / 1564
Iteration 1557 / 1564
Iteration 1558 / 1564
Iteration 1559 / 1564
Iteration 1560 / 1564
Iteration 1561 / 1564
Iteration 1562 / 1564
Iteration 1563 / 1564
Iteration 1564 / 1564


In [280]:
reduced_calories.shape

(193237, 3)

In [281]:
# Save the data to pickle variable
save_pickle(reduced_calories, file_path = 'reduced_calories.pickle')

In [292]:
calories.shape

(1064557, 3)

In [106]:
test = load_pickle('reduced_calories.pickle')

In [283]:
# Write to json file 
reduced_calories.to_json('calories_filtered.json', orient='records')

In [289]:
# Transform the json file into geojson file

in_file = 'calories_filtered.json'
out_file = 'geo_calories_filtered.geojson'


data = json.load(open(in_file))

geojson = {
    "type": "FeatureCollection",
    "features": [
    {
        "type": "Feature",
        "geometry" : {
            "type": "Point",
            "coordinates": [d["lon"], d["lat"]],
            },
        "properties" : {"calories": d["Calories"] },
     } for d in data]
}


output = open(out_file, 'w')
json.dump(geojson, output)