## Religion
### Association of Religion Data Archives - National Religion Dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

This is a time series dataset. It contains no missing values.

In [2]:
religion_df = pd.read_csv('../datasets/processed/religion/world-religion-project/national-religion-dataset.csv', header=0)
print(f"Records: {len(religion_df)}")

print(religion_df.info())

religion_df.describe()


Records: 1934
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1934 entries, 0 to 1933
Data columns (total 87 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      1934 non-null   int64  
 1   cowcode   1934 non-null   int64  
 2   ardacode  1934 non-null   int64  
 3   iso3      1934 non-null   object 
 4   numiso    1934 non-null   int64  
 5   country   1934 non-null   object 
 6   chrsprot  1934 non-null   int64  
 7   chrscat   1934 non-null   int64  
 8   chrsorth  1934 non-null   int64  
 9   chrsang   1934 non-null   int64  
 10  chrsothr  1934 non-null   int64  
 11  chrsgen   1934 non-null   int64  
 12  judorth   1934 non-null   int64  
 13  jdcons    1934 non-null   int64  
 14  judref    1934 non-null   int64  
 15  judothr   1934 non-null   int64  
 16  judgen    1934 non-null   int64  
 17  islmsun   1934 non-null   int64  
 18  islmshi   1934 non-null   int64  
 19  islmibd   1934 non-null   int64  
 20  islmnat   1934 n

Unnamed: 0,year,cowcode,ardacode,numiso,chrsprot,chrscat,chrsorth,chrsang,chrsothr,chrsgen,...,otgenpct,sumpct,total,dualrel,datatype,sreliab,rreliab,reliab,sourcecd,version
count,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,...,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0
mean,1983.655636,452.140641,125.040331,425.181489,2060242.0,5199000.0,1098717.0,384875.6,721921.5,9464755.0,...,0.016302,0.99574,1.012043,0.02999,67.025853,14.093071,17.362461,1.883661,53.434333,1.1
std,18.769777,259.648782,69.348624,249.586747,9377198.0,13717350.0,6649538.0,2463556.0,2525904.0,22621280.0,...,0.031006,0.090529,0.082711,0.170603,91.793124,27.447012,9.823084,0.595214,90.258585,2.22102e-16
min,1945.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.7193,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.1
25%,1970.0,220.0,66.0,208.0,9792.5,44142.5,0.0,0.0,0.0,245719.5,...,0.001,0.983225,1.0,0.0,24.0,4.0,10.0,2.0,7.0,1.1
50%,1985.0,450.0,124.0,414.0,118118.0,421240.0,418.0,147.5,17498.0,1927634.0,...,0.004299,0.9957,1.0,0.0,24.0,5.0,20.0,2.0,13.0,1.1
75%,2000.0,663.0,183.0,642.0,990201.2,3643860.0,40764.5,12000.0,303189.8,7570553.0,...,0.016774,0.999,1.0,0.0,123.0,8.0,24.0,2.0,83.0,1.1
max,2010.0,990.0,249.0,894.0,132040200.0,127311500.0,102853800.0,29705800.0,39683470.0,233111300.0,...,0.2807,1.7277,1.7277,1.0,1234.0,99.0,35.0,3.0,751.0,1.1


In [3]:
religion_df.head()

Unnamed: 0,year,cowcode,ardacode,iso3,numiso,country,chrsprot,chrscat,chrsorth,chrsang,...,otgenpct,sumpct,total,dualrel,datatype,sreliab,rreliab,reliab,sourcecd,version
0,1945,2,234,USA,840,United States of America,66069671,38716742,1121898,2400000,...,0.003899,0.9961,1.0,0,34,2,10,2,13,1.1
1,1950,2,234,USA,840,United States of America,73090083,42635882,3045420,3045420,...,0.0041,0.9959,1.0,0,34,6,28,1,18,1.1
2,1955,2,234,USA,840,United States of America,79294628,46402368,3454916,2572767,...,0.019299,0.9807,1.0,0,134,5,10,2,15,1.1
3,1960,2,234,USA,840,United States of America,90692928,50587880,3334535,2710065,...,0.007599,0.9924,1.0,0,134,2,10,2,13,1.1
4,1965,2,234,USA,840,United States of America,94165803,64761783,4792868,2822149,...,0.003,0.997,1.0,0,134,8,28,1,20,1.1


For each country I'll take the major religion and create a boolean predictor for it.

In [10]:
religions_columns = ['chprtpct', 'chcatpct', 'chortpct', 'changpct', 'chothpct', 'jdorpct', 'jdcnpct', 'jdrfpct', 'jdotpct', 'issunpct', 'isshipct', 'isibdpct', 'isnatpct', 'isalapct', 'isahmpct', 'islotpct', 'bumahpct', 'buthrpct', 'buothpct', 'zogenpct', 'higenpct', 'sigenpct', 'shgenpct', 'bagenpct', 'tagenpct', 'jagenpct', 'cogenpct', 'sygenpct', 'angenpct', 'norelpct', 'otgenpct']
religion_df['dominant_religion'] = religion_df[religions_columns].idxmax(axis=1)

# for religion in religions_columns:
#     maj_column_name = f'maj_{religion}'
#     rdi_df[maj_column_name] = (rdi_df['dominant_religion'] == religion).astype(int)


religion_df[['year', 'country', 'dominant_religion']][:20]





Unnamed: 0,year,country,dominant_religion
0,1945,United States of America,chprtpct
1,1950,United States of America,chprtpct
2,1955,United States of America,chprtpct
3,1960,United States of America,chprtpct
4,1965,United States of America,chprtpct
5,1970,United States of America,chprtpct
6,1975,United States of America,chprtpct
7,1980,United States of America,chprtpct
8,1985,United States of America,chprtpct
9,1990,United States of America,chprtpct


This dataset is less reliable than the one from the Pew Research Center but give more detailed information about religions in case that it's needed.



I'll take a snapshot of the last year and create a new dataset.

In [11]:
religion_df['year'].value_counts().sort_values(ascending=False)

year
2010    192
2005    190
2000    189
1995    185
1990    158
1985    153
1980    150
1975    143
1970    132
1965    120
1960    106
1955     81
1950     73
1945     62
Name: count, dtype: int64

In [13]:
religion_df_snapshot = religion_df[religion_df['year'] == 2010]
religion_df_snapshot.drop(columns=['year'], inplace=True)
religion_df_snapshot.to_csv('../datasets/processed/religion/world-religion-project/national-religion-dataset-snapshot.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  religion_df_snapshot.drop(columns=['year'], inplace=True)
