In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json

# Read Data

In [46]:
metadata = pd.read_csv('./data/metadata.csv')
all_india = metadata[metadata['Division']=='India']

for i,row in all_india.iterrows():
    all_india.loc[i, ["Local_lat"]] = float(row["Local_lat"])
    
north_indian_states = {
       'Chhattisgarh',
       'Maharashtra', 'Malwa',
       'Chota Nagpur Plateau',
       'Bihar/ Uttar Pradesh', 'Uttar Pradesh', 'Varanasi, Uttar Pradesh',
       'Rajasthan', 'Gujarat',
       'Gujarat/ Madhya Pradesh/ Maharashtra/ Rajasthan'
}

north_india = pd.DataFrame()
south_india = pd.DataFrame()

for i,row in all_india.iterrows():
    if row["Area/Kingdom"] in north_indian_states:
        north_india = north_india.append(all_india.loc[i])
    else:
        south_india = south_india.append(row)
        
north_india = north_india.rename(columns={'C-id':'canto_coding_id'})
south_india = south_india.rename(columns={'C-id':'canto_coding_id'})

north_india.to_csv('./data/north-india.csv')
south_india.to_csv('./data/south-india.csv')

In [47]:
classfication_metadata = pd.read_csv('./data/classification_metadata.csv',encoding='latin1')
classfication_metadata[classfication_metadata['Culture']=='Dhurwa Gond']['Lang_Family'].values[0]
language_map = {
    'Dravidian, Central Dravidian':'Dravidian',
    'Dravidian, South Dravidian, Gondi, Koya': 'Dravidian',
    'Dravidian, South Dravidian, Gondi' :'Dravidian',
    'Dravidian, South Dravidian': 'Dravidian',
    'Dravidian, North Dravidian, Kurux-Malto, Kurux':'Dravidian',
    'Indo-European, Indo-Iranian, Indo-Aryan, Western Hindi, Hindustani': 'Indo-European',
    'Austroasiatic, Mundaic, North Munda': 'Austroasiatic',
    'Austroasiatic, Mundaic, Mundaric, Ho-Mundari':'Austroasiatic',
    'Austroasiatic, Mundaic, North Munda, Mundaric':'Austroasiatic',
    'Austroasiatic, Mundaic, North Munda, Kherwarian':'Austroasiatic',
    'Indo-European, Indo-Iranian, Indo-Aryan, Bihari, Western Magadhan, Bhojpuric':'Indo-European',
    'Indo-European, Indo-Iranian, Indo-Aryan, Gujarati-Rajasthani':'Indo-European',
    'Indo-European, Indo-Iranian, Indo-Aryan, Gujarati-Rajasthani, Gujaratic':'Indo-European',
    'Indo-European, Indo-Iranian, Indo-Aryan, Subcontinental Central Indo-Aryan':'Indo-European',
    'Indo-European, Indo-Iranian, Indo-Aryan, Rajasthani':'Indo-European',
    'Dravidian, Central Dravidian, Kolami-Naiki':'Dravidian',
    'Indo-European, Indo-Iranian, Indo-Aryan, Marathic':'Indo-European',
    'Dravidian, South Dravidian, Tamil-Kannada, Tamil-Malayalam, Tamiloid':'Dravidian',
    'Dravidian, South Dravidian, Teluguic':'Dravidian',
    'Dravidian, South Dravidian, Gondi':'Dravidian',
    'Dravidian, Parji-Ollari-Gadaba, Ollari-Gadaba':'Dravidian',
    'Austroasiatic, Mundaic, Sora-Juray-Gorum':'Austroasiatic',
    'Dravidian, South Dravidian, Konda-Kui':'Dravidian',
    'Austroasiatic, Mundaic, South Munda, Gutob-Remo':'Austroasiatic',
    'Indo-European, Indo-Iranian, Indo-Aryan, Oriya-Gauda-Kamrupa, Macro-Oriya':'Indo-European',
    'Indo-European, Indo-Iranian, Indo-Aryan, Indo-Aryan Eastern Zone':'Indo-European',
    'Indo-European, Italic, Latinic, Romance, Western Romance, Galician Romance, Macro-Portuguese':'Indo-European',
    'Dravidian, South Dravidian, Tamil-Kannada, Tamil-Malayalam, Malayamoid':'Dravidian',
    'Dravidian, South Dravidian, Tamil-Kannada, Tamil-Toda':'Dravidian',
}

In [48]:
north_india = pd.read_csv('./data/north-india.csv')
south_india = pd.read_csv('./data/south-india.csv')
metadata = pd.read_csv('./data/metadata.csv')
with open('./output/output.json') as json_file:
    codebook = json.load(json_file)

In [49]:
print("Total number of tracks is",len(north_india)+len(south_india))

Total number of tracks is 208


## Draw from Google Sheets

In [50]:
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Reference: https://developers.google.com/sheets/api/quickstart/python
def read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE):
    creds = None
    # autogenerated
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('sheets', 'v4', credentials=creds)

    sheet = service.spreadsheets()
    result = sheet.values().get(spreadsheetId=SPREADSHEET_ID,
                                range=RANGE_NAME).execute()
    
    header = sheet.values().get(spreadsheetId=SPREADSHEET_ID,
                               range=HEADER_RANGE).execute()
    
    header_values = header.get('values', [])
    values = result.get('values', [])
    
    return values, header_values

SPREADSHEET_ID = '1AjynK9mMQTw58B_B8b_ZIip3fyUm-aoV7Pp21HziBb0'
RANGE_NAME = 'canto_codings!A2:AT'
HEADER_RANGE = 'canto_codings!A1:AT1'

data, header = read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE)

canto_codings = pd.DataFrame(data, columns = header[0])

## Utility Functions

In [51]:
def find_canto_features(canto_coding_id):
    for i,row in canto_codings.iterrows():
        if(str(canto_coding_id) == row['song_id']):
            return row

In [52]:
def get_canto_metadata(canto_coding_id):
    for i, row in metadata.iterrows():
        if(str(canto_coding_id) == row['C-id']):
            return row

In [53]:
def get_display_code(line, binary_code):
    line = codebook['line_'+str(line)]
    for encoding in line:
        if encoding['code'] == str(binary_code):
            return encoding['display_code']

In [54]:
def get_language_family(culture):
    indian_metadata = classfication_metadata[classfication_metadata['Culture']==culture]
    try:
        language = indian_metadata['Lang_Family'].values[0]
        people = indian_metadata['People'].values[0]
        if "(Ethnic Peoples)" in people:
            people = "Tribal"
        else:
            people = "Non-Tribal"
    except IndexError as e:
        return "Dravidian", "Tribal"
    return language_map[language], people

# Prepare Output Data Structure

In [55]:
columns = ["canto_coding_id", "soc_id", "region", "division","subregion", "area_kingdom", "culture", "language", "people","song","lat", "lng"]

for i in range(37):
    columns.append("line_"+str(i+1))
    
north_india_full = pd.DataFrame(columns=columns)
south_india_full = pd.DataFrame(columns=columns)

# Connect Everything

In [71]:
def main(input_matrix, output_matrix):
    for i, row in input_matrix.iterrows():
        # metadata
        canto_coding_id = row['canto_coding_id']
        culture = row['Culture']
        soc_id = row['C_cid']
        meta = get_canto_metadata(int(canto_coding_id))
        lat = meta['Local_lat']
        lng = meta['Local_long']
        region = meta['Region']
        division = meta['Division']
        subregion = meta['Subregion']
        area = meta['Area/Kingdom']
        song = meta['Song']
        language,people = get_language_family(culture)
        canto = find_canto_features(canto_coding_id)
        
        canto_data = []
        for i in range(37):
            canto_data.append(get_display_code(i+1, canto["cv_"+str(i+1)]))
        new_row = pd.DataFrame([[
            canto_coding_id,
            soc_id,
            region,
            division,
            subregion,
            area,
            culture,
            language,
            people,
            song,
            lat,
            lng,
            canto_data[0],
            canto_data[1],
            canto_data[2],
            canto_data[3],
            canto_data[4],
            canto_data[5],
            canto_data[6],
            canto_data[7],
            canto_data[8],
            canto_data[9],
            canto_data[10],
            canto_data[11],
            canto_data[12],
            canto_data[13],
            canto_data[14],
            canto_data[15],
            canto_data[16],
            canto_data[17],
            canto_data[18],
            canto_data[19],
            canto_data[20],
            canto_data[21],
            canto_data[22],
            canto_data[23],
            canto_data[24],
            canto_data[25],
            canto_data[26],
            canto_data[27],
            canto_data[28],
            canto_data[29],
            canto_data[30],
            canto_data[31],
            canto_data[32],
            canto_data[33],
            canto_data[34],
            canto_data[35],
            canto_data[36],
        ]], columns = columns)
        output_matrix = output_matrix.append(new_row)
    return output_matrix

## Run data conversion. This will take a while

In [61]:
north_india_full = main(north_india, north_india_full)

In [None]:
south_india_full = main(south_india, south_india_full)

In [74]:
canto_coding_id

NameError: name 'canto_coding_id' is not defined

## Write data to disk

In [63]:
north_india_full.to_csv('./data/north_india_full.csv', index=False)
south_india_full.to_csv('./data/south_india_full.csv', index=False)

In [69]:
south_india

Unnamed: 0.1,Unnamed: 0,Archival_source#,Area/Kingdom,Audio_file,Audio_notes,canto_coding_id,C_cid,Classification_notes,Culture,Culture_loc,...,Publisher,Recorded_by,Region,Repository,Song,Song_notes,Source Tag,Sources,Subregion,Year
0,4694,T1790.3,Andhra Pradesh/ Madhya Pradesh/ Maharashtra/ T...,T5416R30,,3851,11617.0,,Banjara,"Marathwada Area, India",...,All India Radio,All India Radio,South Asia,"Sound and Central Archives of All India Radio,...","Sewa Bhaya, Sewa Bhaya Seva Kare Re",A harvest song in praise of Sewa Bhaya,@AIR1948-50SewaBhayaSewaBhayaSevaKareRe,,C India/ Central Tribal Area,1948-1950
1,4719,T1786.B8,Chhattisgarh/ Madhya Pradesh/ Maharashtra/ Tel...,T5419R21m,"Poor audio, hum",3844,18778.0,,Kolam,"Waghapur, Andhra Pradesh, India",...,Previously Unpublished,Dept. of Anthropology Government of India,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Warrenga,,@ANSI1953Warrenga,,C India/ Central Tribal Area,1953
2,4720,T1786.B7,Chhattisgarh/ Madhya Pradesh/ Maharashtra/ Tel...,T5419R20m,"Poor audio, hum",3845,18778.0,,Kolam,"Waghapur, Andhra Pradesh, India",...,Previously Unpublished,Dept. of Anthropology Government of India,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Nare Na,Sung by men folk on summer nights,@ANSI1953NareNa,,C India/ Central Tribal Area,1953
3,4721,T1786.B6,Chhattisgarh/ Madhya Pradesh/ Maharashtra/ Tel...,T5419R19m,"Poor audio, hum",3846,18778.0,,Kolam,"Waghapur, Andhra Pradesh, India",...,Previously Unpublished,Dept. of Anthropology Government of India,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Devotional Song,,@ANSI1953DevotionalSong_1,,C India/ Central Tribal Area,1953
4,4743,T1782.1,Maharashtra/ Telangana,T5417R40,,3762,10594.0,,Andh,"Korat, Central India",...,Previously Unpublished,"Department of Anthropology, Government of India",South Asia,"Copy in Alan Lomax Collection, American Folkli...",Cradle Song,"Cradle song, trio of young girls sing in uniso...",@ANSI1953CradleSong_1,,C India/ Central Tribal Area,1953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,4896,T5569.08,Nilgiri Mtns,T5569R08,,4459,27259.0,,Toda,"Ooty and Vicinity, The Nilgiris, Tamil Nadu, I...",...,Previously Unpublished,Andrew Kaye,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Toda Woman,Female soloist,@ALC1989TodaKaye,Copies made for A. L. by Andrew Kaey,Western Ghats,1989
83,4897,T5569.10,Nilgiri Mtns,T5569R10m,,4460,27259.0,,Toda,"Toror, Near Glen Morgan, The Nilgiris, Tamil N...",...,Previously Unpublished,Andrew Kaye,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Wedding Song,Wedding song,@ALC1989TodaKaye,Copies made for A. L. by Andrew Kaey,Western Ghats,1989
84,4898,T5569.13,Nilgiri Mtns,T5569R13,,4461,27259.0,,Toda,"Toror, Near Glen Morgan, The Nilgiris, Tamil N...",...,Previously Unpublished,Andrew Kaye,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Farewell Song After Wedding,Farewell song,@ALC1989TodaKaye,Copies made for A. L. by Andrew Kaey,Western Ghats,1989
85,4899,T5569.15,Nilgiri Mtns,T5569R15,,4462,27259.0,,Toda,"Toror, Near Glen Morgan, The Nilgiris, Tamil N...",...,Previously Unpublished,Andrew Kaye,South Asia,"Copy in Alan Lomax Collection, American Folkli...",Post Wedding Circle Dance,Men's circle dance song,@ALC1989TodaKaye,,Western Ghats,1989


In [64]:
region = []
for i in range(len(north_india_full)):
    region.append("North")
for i in range(len(south_india_full)):
    region.append("South")

frames = [north_india_full, south_india_full]
all_india_full = pd.concat(frames)
all_india_full["region"] = region
all_india_full = all_india_full[all_india_full["culture"]!="Portuguese Goa"]
all_india_full.to_csv('./data/all_india_full.csv', index=False)

In [65]:
all_india_full = pd.read_csv('./data/all_india_full.csv')
all_india_full['language'].value_counts()

Dravidian        48
Indo-European    38
Austroasiatic    35
Name: language, dtype: int64

In [19]:
all_india_full['people'].value_counts()

Tribal        150
Non-Tribal     57
Name: people, dtype: int64

In [20]:
c = 0
for i, row in all_india_full.iterrows():
    if row['language'] != 'Austroasiatic' and row['people']=='Non-Tribal':
        c+=1
c

47

## Important: Change the file's content so that Tamil People are coded as dravidians!

# Preparing Data for AMOVA for Indian Subset Sample in R

## Removing Cultures With Only One Sample

In [21]:
from scipy.spatial import distance_matrix
import math
import numpy as np

## Samples Dataframe

In [22]:
all_india_full = pd.read_csv('./data/all_india_full.csv')

In [23]:
# This value was missing. 
all_india_full['cv_1'][196] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_india_full['cv_1'][196] = 2


In [24]:
all_india_full['Number of Samples'] = all_india_full.groupby('culture')['culture'].transform('count')
no_singles = all_india_full[all_india_full['Number of Samples']!=1]
singles = all_india_full[all_india_full['Number of Samples']==1]
filtered_df = all_india_full[(all_india_full['language']!='Austroasiatic') & (all_india_full['people']!='Tribal')]

filtered_df.to_csv('./data/filtered.csv')

In [25]:
sample_columns = filtered_df['culture'].unique()
sample_columns = np.insert(sample_columns, 0, 'canto_coding_id', axis=0)
samples = pd.DataFrame(columns=sample_columns)
## fill this up with zeros

for i, row in filtered_df.iterrows():
    samples.loc[i, row['culture']] = 1
    samples.loc[i, 'canto_coding_id'] = row["canto_coding_id"]
    
samples = samples.fillna(0)
samples = samples.reset_index()
samples = samples.drop(columns=['index'])

samples.to_csv('./data/samples.csv')

## Samples for subset of features

In [28]:
# drop<- c('cv_1','cv_5','cv_6','cv_12','cv_22','cv_2','cv_3','cv_8','cv_9','cv_13','cv_14','cv_27')

Index(['canto_coding_id', 'region', 'division', 'subregion', 'area_kingdom',
       'culture', 'language', 'people', 'song', 'lat', 'lng', 'cv_1', 'cv_2',
       'cv_3', 'cv_4', 'cv_5', 'cv_6', 'cv_7', 'cv_8', 'cv_9', 'cv_10',
       'cv_11', 'cv_12', 'cv_13', 'cv_14', 'cv_15', 'cv_16', 'cv_17', 'cv_18',
       'cv_19', 'cv_20', 'cv_21', 'cv_22', 'cv_23', 'cv_24', 'cv_25', 'cv_26',
       'cv_27', 'cv_28', 'cv_29', 'cv_30', 'cv_31', 'cv_32', 'cv_33', 'cv_34',
       'cv_35', 'cv_36', 'cv_37', 'Number of Samples'],
      dtype='object')

## Structures Dataframe

In [34]:
structure_list = []
for i in range(len(sample_columns)-1):
    structure_list.append(filtered_df[filtered_df['culture']==sample_columns[i+1]]['language'].all())
structure = pd.DataFrame(structure_list, columns = ['language'])
structure = structure['language'].replace('Indo-European', 'Indo-Aryan')
structure.to_csv('./data/structure.csv')

In [36]:
samples.columns

Index(['canto_coding_id', 'C Indian Folk', 'Uttar Pradesh', 'Benares',
       'Rajasthan', 'Saurashtra', 'Banjara', 'Andh', 'Madras', 'Kerala'],
      dtype='object')

# Preparing Data for AMOVA for Indian Full Sample in R

## Removing Cultures With Only One Sample

In [37]:
all_india_full = pd.read_csv('./data/all_india_full.csv')
# This value was missing. 
all_india_full.loc[196,('cv_1')] = 2
all_india_full['Number of Samples'] = all_india_full.groupby('culture')['culture'].transform('count')
no_singles = all_india_full[all_india_full['Number of Samples']!=1]
no_singles.to_csv('./data/no_singles_full.csv')

In [38]:
sample_columns = no_singles['culture'].unique()
sample_columns = np.insert(sample_columns, 0, 'canto_coding_id', axis=0)
samples = pd.DataFrame(columns=sample_columns)
## fill this up with zeros

for i, row in no_singles.iterrows():
    samples.loc[i, row['culture']] = 1
    samples.loc[i, 'canto_coding_id'] = row["canto_coding_id"]
    
samples = samples.fillna(0)
samples = samples.reset_index()
samples = samples.drop(columns=['index'])

samples.to_csv('./data/samples_full.csv')

## Structures Dataframe

In [39]:
structure_list = []
for i in range(len(sample_columns)-1):
    structure_list.append(no_singles[no_singles['culture']==sample_columns[i+1]]['language'].all())
structure = pd.DataFrame(structure_list, columns = ['language'])
structure = structure['language'].replace('Indo-European', 'Indo-Aryan')
structure.to_csv('./data/structure_full.csv')

In [44]:
samples.columns

Index(['canto_coding_id', 'Dhurwa Gond', 'Dorla', 'Muria Gond', 'Gond',
       'Korku', 'Bhumij', 'Kurukh', 'Munda', 'Bhojpuri', 'Uttar Pradesh',
       'Benares', 'Rajasthan', 'Kolam', 'Tamil People', 'Madras',
       'Andhra Pradesh', 'Koya', 'Gadaba', 'Hill Saora', 'Konda-Dora', 'Bonda',
       'Oriya', 'Paroja', 'Kerala', 'Toda'],
      dtype='object')

# Prepare AMOVA Data for Global Sample in R
## Remove cultures with Only One Sample

In [49]:
canto_codings['Number of Samples'] = canto_codings.groupby('Culture')['Culture'].transform('count')
singles = canto_codings[canto_codings['Number of Samples']!=1]
singles.to_csv('./data/global_no_singles.csv')

## Appending regions to the canto_codings dataframe

In [50]:
canto_codings['region'] = metadata['Region']

## Samples Dataframe
### This will take a while to run

In [29]:
sample_columns = singles['Culture'].unique()
sample_columns = np.insert(sample_columns, 0, 'canto_coding_id', axis=0)
samples = pd.DataFrame(columns=sample_columns)
## fill this up with zeros

for i, row in singles.iterrows():
    samples.loc[i, row['Culture']] = 1
    samples.loc[i, 'canto_coding_id'] = row["canto_coding_id"]
samples = samples.fillna(0)
samples.to_csv('./data/global_samples.csv')

In [30]:
## Structures Dataframe

In [31]:
structure_list = []
for i in range(len(sample_columns)-1):
    structure_list.append(canto_codings[canto_codings['Culture']==sample_columns[i+1]]['region'].all())
structure = pd.DataFrame(structure_list, columns = ['region'])
structure.to_csv('./data/global_structure.csv')

In [152]:
filtered_df['language'].value_counts()

Indo-European    28
Dravidian        19
Name: language, dtype: int64