In [1]:
import pandas as pd

In [2]:
# Load the raw 2010-2012 entries file and preview it.
# header=None: the first line is read as data and the columns are numbered 0..n-1.
entrants_10_12 = pd.read_csv('number_of_entries_2010_2012.csv', header=None)
entrants_10_12.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Afghanistan,39446,27424,66870,9627,10029,19656,5519,6937,12456
1,Albania,53104,65809,118913,40823,46366,87189,55686,59671,115357
2,Algeria,68052,22652,90704,66404,21600,88004,85568,26513,112081
3,Andorra,276,55,331,79,26,105,129,39,168
4,Angola,883,934,1817,1039,1291,2330,1391,1664,3055


![alt text](DV_table.png "DV_table")

The information we need is the total number of entries for each year, which is found in columns 3, 6 and 9. We will drop the other columns since we will not use them.

In [3]:
# Read the three entries files into Jupyter Notebooks, keeping only the
# country name and the total number of entries for each year.
# In the raw files column 0 is the country and columns 3, 6 and 9 hold the
# yearly totals; every other column is discarded.
TOTAL_COLUMNS = [0, 3, 6, 9]


def load_entries(csv_path, years):
    """Load one header-less entries CSV as a Country + yearly-totals table.

    Parameters
    ----------
    csv_path : str or file-like
        Location of the CSV; passed straight to ``pd.read_csv``.
    years : sequence of str
        Names for the three total columns, in file order.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Country', *years]`` taken from positions 0, 3, 6 and 9.
    """
    raw = pd.read_csv(csv_path, header=None)
    # Select by position so the result always has exactly four columns.
    totals = raw.iloc[:, TOTAL_COLUMNS]
    totals.columns = ['Country'] + list(years)
    return totals


entrants_10_12 = load_entries('number_of_entries_2010_2012.csv', ['2010', '2011', '2012'])
entrants_13_15 = load_entries('number_of_entries_2013_2015.csv', ['2013', '2014', '2015'])
entrants_16_18 = load_entries('number_of_entries_2016_2018.csv', ['2016', '2017', '2018'])

In [4]:
# Build a single 2010-2018 table. Both joins are left joins on 'Country', so
# every row of the 2010-2012 table is kept and later years are attached only
# where the country name matches exactly.
# NOTE(review): names are title-cased/right-stripped only after this merge;
# verify that the spelling of each country is identical across the three files.
combined_1 = pd.merge(entrants_10_12, entrants_13_15, on='Country', how='left')
combined = pd.merge(combined_1, entrants_16_18, on='Country', how='left')
combined

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Afghanistan,66870,19656,12456,16277,31485,27789,65951,83277,122755
1,Albania,118913,87189,115357,122567,171362,198625,246129,315842,367231
2,Algeria,90704,88004,112081,128189,127872,163091,204433,252733,342857
3,Andorra,331,105,168,147,144,134,186,166,175
4,Angola,1817,2330,3055,3578,5884,9198,11546,14859,22819
5,Antigua and Barbuda,826,695,714,728,755,648,637,641,525
6,Argentina,9821,12025,10936,10246,12853,12154,17727,18309,19811
7,Armenia,62051,72876,92352,97679,113681,131088,168115,226545,267067
8,Aruba,1399,604,717,629,665,556,603,596,611
9,Australia,12772,15509,17163,18117,21621,20579,24888,26704,26353


In [5]:
# Normalize text
import re
def normalize_text(text):
    """Replace each character that is not an ASCII letter, digit or whitespace with a space.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        cells (ints, floats, NaN) are accepted.

    Returns
    -------
    str
        The cleaned text; every replaced character becomes exactly one space.

    Notes
    -----
    The decimal point is not in the allowed set, so a float such as ``12.5``
    becomes ``'12 5'``.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    return re.sub(r'[^A-Za-z0-9\s]', ' ', str(text))

def title_text(text):
    """Return ``text`` in title case with any trailing whitespace removed."""
    return text.title().rstrip()

def normalize_value(text):
    """Parse ``text`` as an integer after removing all whitespace.

    Parameters
    ----------
    text : str
        Text that may contain whitespace between or around the digits.

    Returns
    -------
    int
        The parsed integer, or 0 when the remaining text is not a valid
        integer literal (including the empty string).
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    compact = re.sub(r'\s', '', text)
    try:
        return int(compact)
    except ValueError:
        # Only parse failures fall back to 0; a bare except would also hide
        # KeyboardInterrupt and SystemExit.
        return 0

In [6]:
# Every column after the leading 'Country' column holds yearly values.
# (DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.)
year_cols = slice(1, None)

# Step 1: replace punctuation with spaces and write the text back.
cleaned_text = combined.iloc[:, year_cols].applymap(normalize_text)
combined.iloc[:, year_cols] = cleaned_text

# Step 2: convert the stored text to integers (unparseable cells become 0).
cleaned_numbers = combined.iloc[:, year_cols].applymap(normalize_value)
combined.iloc[:, year_cols] = cleaned_numbers

# Title-case the country names and remove trailing whitespace.
combined.iloc[:, 0] = combined.iloc[:, 0].apply(title_text)

In [7]:
# Absolute change in the number of applications between 2010 and 2018
entries_2010 = combined['2010']
entries_2018 = combined['2018']
combined['8_years_diff'] = entries_2018 - entries_2010

# Percentage change from 2010 to 2018. Only rows with a positive value in
# both years produce a result; the division aligns on the index, so every
# other row is left as null here.
positive_2018 = entries_2018[entries_2018 > 0]
positive_2010 = entries_2010[entries_2010 > 0]
combined['%_increase'] = (positive_2018 / positive_2010 * 100) - 100

# Replace null values with 0 (rows are kept, not dropped)
combined = combined.fillna(0)

# Average of the nine yearly columns (positions 1-9) for every country,
# rounded to a whole number, then sort from highest to lowest average
yearly_mean = combined.iloc[:, 1:10].mean(axis=1)
combined['average'] = round(yearly_mean)
combined = combined.sort_values(by='average', ascending=False)
DV_num_of_entries = combined


In [8]:
# Save the sorted table; the row index is written as the first CSV column.
DV_num_of_entries.to_csv('DV_num_of_entries.csv', index=True)

In [9]:
# Show every row except the first one of the sorted table, highest average first.
DV_num_of_entries.iloc[1:].sort_values(by='average', ascending=False)

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,8_years_diff,%_increase,average
14,Bangladesh,834324,6497926,8562251,0,0,0,0,0,0,-834324,0.000000,1766056.0
69,Ghana,523448,609924,774557,1056032,1217888,1729979,2231745,2199021,2227530,1704082,325.549434,1396680.0
198,Ukraine,927470,1080091,1122086,1232306,1255129,1274758,1291999,1470250,1450487,523017,56.391797,1233842.0
134,Nigeria,1570316,2144626,2005876,1975571,2390758,0,0,0,0,-1570316,0.000000,1120794.0
201,Uzbekistan,194033,295828,507361,654327,1193657,1387420,1488984,1576179,2114446,1920413,989.735251,1045804.0
83,Iran,470279,426505,547735,692410,1037354,932346,1203531,1390853,1624204,1153925,245.370301,925024.0
53,Egypt,642321,534375,780728,960279,847230,763701,849681,914817,1274751,632430,98.460116,840876.0
57,Ethiopia,745372,662658,785318,775763,753701,845474,780829,844959,1056532,311160,41.745598,805623.0
127,Nepal,356216,322758,397036,496410,613385,892961,1090880,1136622,1187350,831134,233.323040,721513.0
165,Sierra Leone,326452,441250,315741,233519,284205,511120,611983,841688,1011725,685273,209.915393,508631.0


When we sort the countries by their average number of applications, the first 5 countries each have an average of more than 1 million lottery applications. It also seems that the U.S. stopped taking applications from Bangladesh and Nigeria at some point. This may be because of the country quota that the U.S. had set. In further steps, we can analyze the ratio of the number of applications to population.

If we look at the bottom row, we see that the U.S. did not take any applications from Brazil between 2010 and 2018.

From this point on, we will continue by analyzing the 20 countries with the highest average number of applications.

In [10]:
# Top 20 countries by average number of applications.
# Row 0 of the sorted table is skipped (presumably an aggregate row - confirm),
# so positions 1 through 20 are needed; iloc[1:20] would return only 19 rows.
DV_num_of_entries.iloc[1:21].sort_values('average', ascending=False)

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,8_years_diff,%_increase,average
14,Bangladesh,834324,6497926,8562251,0,0,0,0,0,0,-834324,0.0,1766056.0
69,Ghana,523448,609924,774557,1056032,1217888,1729979,2231745,2199021,2227530,1704082,325.549434,1396680.0
198,Ukraine,927470,1080091,1122086,1232306,1255129,1274758,1291999,1470250,1450487,523017,56.391797,1233842.0
134,Nigeria,1570316,2144626,2005876,1975571,2390758,0,0,0,0,-1570316,0.0,1120794.0
201,Uzbekistan,194033,295828,507361,654327,1193657,1387420,1488984,1576179,2114446,1920413,989.735251,1045804.0
83,Iran,470279,426505,547735,692410,1037354,932346,1203531,1390853,1624204,1153925,245.370301,925024.0
53,Egypt,642321,534375,780728,960279,847230,763701,849681,914817,1274751,632430,98.460116,840876.0
57,Ethiopia,745372,662658,785318,775763,753701,845474,780829,844959,1056532,311160,41.745598,805623.0
127,Nepal,356216,322758,397036,496410,613385,892961,1090880,1136622,1187350,831134,233.32304,721513.0
165,Sierra Leone,326452,441250,315741,233519,284205,511120,611983,841688,1011725,685273,209.915393,508631.0
