# Region and The World Color Survey: Final Project

COG 260: Data, Computation, and The Mind (Yang Xu)

Data source: http://www1.icsi.berkeley.edu/wcs/data.html

______________________________________________

Import helper function file for WCS data analysis.

In [21]:
from wcs_helper_functions import *

Import relevant Python libraries.

In [22]:
import numpy as np
from scipy import stats
from random import random
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt



`munsellInfo` is a 2-element tuple with dictionary elements.

In [23]:
# Load the chip table with the course helper; per the notebook text above the
# result is a 2-element tuple whose elements are dictionaries.
munsellInfo = readChipData('./WCS_data_core/chip.txt')


`namingData` is a hierarchical dictionary organized as follows:

**language _(1 - 110)_ &rarr; speaker _(1 - *range varies per language*)_ &rarr; chip index _(1 - 330)_ &rarr; color term**

In [24]:
# Load the naming responses with the course helper; the notebook text above
# describes the nesting as language -> speaker -> chip index -> colour term.
namingData = readNamingData('./WCS_data_core/term.txt')

In [36]:
# Load the focus data with the course helper. The cells below index it as
# fociData[language][speaker][key] and treat each value as a list of
# 'ROW:COLUMN' strings.
# NOTE(review): the third-level key is presumably the colour term -- confirm
# against readFociData.
fociData = readFociData('./WCS_data_core/foci-exp.txt')
 



In [26]:
# Helper function for extracting each language's name and country
def readlanguageData(namingDataFilePath):
    """Read a language table into ``{language number: {language name: country}}``.

    Every non-blank line is split on whitespace. Field 0 is parsed as the
    integer language number, field 1 is the language name and field 2 is the
    country; any further fields are ignored. Because the split is on
    whitespace, only the first word of a multi-word country is kept
    (for example ``'Ivory'``).

    Args:
        namingDataFilePath: Path of the language table to read.

    Returns:
        A dict keyed by language number whose values map language name to
        country.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has fewer than three fields.
    """
    languageData = {}

    # The context manager closes the file even when a line fails to parse.
    with open(namingDataFilePath, 'r') as fileHandler:
        for line in fileHandler:
            lineElements = line.split()
            if not lineElements:
                # Blank (or whitespace-only) lines carry no record.
                continue

            languageNumber = int(lineElements[0])
            languageName = lineElements[1]
            country = lineElements[2]

            languageData.setdefault(languageNumber, {})[languageName] = country

    return languageData

# Parse the language table and echo the resulting dictionary as the cell output.
languageData = readlanguageData('./WCS_data_core/language.txt')
languageData

{1: {'Abidji': 'Ivory'},
 2: {'Agarabi': 'Papua'},
 3: {'Casiguran': 'Philippines'},
 4: {'Aguacateco': 'Guatemala'},
 5: {'Amarakaeri': 'Peru'},
 6: {'Ampeeli': 'Papua'},
 7: {'Amuzgo': 'Mexico'},
 8: {'Angaatiha': 'Papua'},
 9: {'Apinaye': 'Brazil'},
 10: {'Arabela': 'Peru'},
 11: {'Bahinemo': 'Papua'},
 12: {'Bauzi': 'Indonesia'},
 13: {'Berik': 'Indonesia'},
 14: {'Bete': 'Ivory'},
 15: {'Bhili': 'India'},
 16: {'Buglere': 'Panama'},
 17: {'Cakchiquel': 'Guatemala'},
 18: {'Ucayali_Campa': 'Peru'},
 19: {'Camsa': 'Colombia'},
 20: {'Candoshi': 'Peru'},
 21: {'Cavinea': 'Bolivia'},
 22: {'Cayapa': 'Ecuador'},
 23: {'Chcobo': 'Bolivia'},
 24: {'Chavacano': 'Philippines'},
 25: {'Chayahuita': 'Peru'},
 26: {'Chinanteco': 'Mexico'},
 27: {'Chiquitano': 'Bolivia'},
 28: {'Chumburu': 'Ghana'},
 29: {'Cofan': 'Ecuador'},
 30: {'Colorado': 'Nevada'},
 31: {'Eastern_Cree': 'Canada'},
 32: {'Culina': 'Brazil'},
 33: {'Didinga': 'Sudan'},
 34: {'Djuka': 'Suriname'},
 35: {'Dyimini': 'Ivory'},

# Color Divisions
Each colour division covers four chip columns:
Red (1 - 4),
Red/Yellow (5 - 8),
Yellow (9 - 12),
Green/Yellow (13 - 16),
Green (17 - 20),
Blue/Green (21 - 24),
Blue (25 - 28),
Blue/Purple (29 - 32),
Purple (33 - 36),
Red/Purple (37 - 40)

In [69]:
# For every language, count the distinct focus chips that fall in each colour
# division: ten hue divisions of four columns each, plus two column-0 chips
# counted as white and black.
red_cc = {}
redyellow_cc = {}
yellow_cc = {}
greenyellow_cc = {}
green_cc = {}
bluegreen_cc = {}
blue_cc = {}
purpleblue_cc = {}
purple_cc = {}
redpurple_cc = {}
white_cc = {}
black_cc = {}

color_chips = [red_cc, redyellow_cc, yellow_cc, greenyellow_cc, green_cc, bluegreen_cc, blue_cc, purpleblue_cc, purple_cc, redpurple_cc, white_cc, black_cc]
color_chip_names = ['red_cc', 'redyellow_cc', 'yellow_cc', 'greenyellow_cc', 'green_cc', 'bluegreen_cc', 'blue_cc', 'purpleblue_cc', 'purple_cc', 'redpurple_cc', 'white_cc', 'black_cc']

for language in fociData:
    # One set of focus strings per division, in the same order as color_chips.
    # Sets de-duplicate on insertion, so no array has to be re-copied per focus.
    division_foci = [set() for _ in color_chips]

    for speaker_foci in fociData[language].values():
        for foci in speaker_foci.values():
            for focus in foci:
                # A focus is a 'ROW:COLUMN' string.
                parts = focus.split(':')
                column = int(parts[1])
                row = parts[0]

                if column == 0:
                    # NOTE(review): 'A' is counted as white and 'B' as black.
                    # If the rows follow an A-J lightness layout, black would
                    # be row 'J' rather than 'B' -- confirm against chip.txt.
                    if row == 'A':
                        slot = 10   # white_cc
                    elif row == 'B':
                        slot = 11   # black_cc
                    else:
                        continue    # other column-0 rows are not counted
                elif 1 <= column <= 40:
                    # Columns 1-4 -> slot 0 (red), 5-8 -> slot 1, ..., 37-40 -> slot 9.
                    slot = (column - 1) // 4
                else:
                    continue        # columns outside 0-40 are not counted

                division_foci[slot].add(focus)

    for counts, foci in zip(color_chips, division_foci):
        counts[language] = len(foci)

for name, counts in zip(color_chip_names, color_chips):
    print(f"{name}: {counts}")


red_cc: {1: 24, 2: 19, 3: 15, 4: 24, 5: 9, 6: 22, 7: 19, 8: 13, 9: 17, 10: 29, 11: 15, 12: 17, 13: 13, 14: 9, 15: 19, 16: 14, 17: 32, 18: 29, 19: 16, 20: 13, 21: 7, 22: 16, 23: 5, 24: 18, 25: 6, 26: 19, 27: 18, 28: 15, 29: 11, 30: 8, 31: 6, 32: 32, 33: 5, 34: 19, 35: 11, 36: 8, 37: 9, 38: 22, 39: 19, 40: 10, 41: 16, 42: 16, 43: 27, 44: 10, 45: 12, 46: 16, 47: 6, 48: 11, 49: 18, 50: 10, 51: 32, 52: 24, 53: 17, 54: 13, 55: 16, 56: 10, 57: 27, 58: 15, 59: 16, 60: 12, 61: 7, 62: 13, 63: 30, 64: 26, 65: 14, 66: 31, 67: 22, 68: 16, 69: 13, 70: 11, 71: 9, 72: 25, 73: 9, 74: 8, 75: 14, 76: 19, 77: 18, 78: 4, 79: 10, 80: 12, 81: 7, 82: 7, 83: 32, 84: 16, 85: 12, 86: 4, 87: 14, 88: 21, 89: 18, 90: 17, 91: 8, 92: 19, 93: 15, 94: 9, 95: 19, 96: 9, 97: 19, 98: 27, 99: 8, 100: 6, 101: 12, 102: 20, 103: 20, 104: 7, 105: 6, 106: 20, 107: 16, 108: 8, 109: 21, 110: 12}
redyellow_cc: {1: 16, 2: 19, 3: 7, 4: 29, 5: 5, 6: 17, 7: 18, 8: 3, 9: 6, 10: 28, 11: 7, 12: 10, 13: 7, 14: 3, 15: 10, 16: 5, 17: 31, 18

In [70]:
# For every language, count the distinct keys of each speaker's focus
# dictionary (taken here to be the colour terms) that have at least one focus
# chip in each colour division.
red_ct = {}
redyellow_ct = {}
yellow_ct = {}
greenyellow_ct = {}
green_ct = {}
bluegreen_ct = {}
blue_ct = {}
purpleblue_ct = {}
purple_ct = {}
redpurple_ct = {}
white_ct = {}
black_ct = {}

color_terms = [red_ct, redyellow_ct, yellow_ct, greenyellow_ct, green_ct, bluegreen_ct, blue_ct, purpleblue_ct, purple_ct, redpurple_ct, white_ct, black_ct]
color_term_names = ['red_ct', 'redyellow_ct', 'yellow_ct', 'greenyellow_ct', 'green_ct', 'bluegreen_ct', 'blue_ct', 'purpleblue_ct', 'purple_ct', 'redpurple_ct', 'white_ct', 'black_ct']

for language in fociData:
    # One set of terms per division, in the same order as color_terms.
    division_terms = [set() for _ in color_terms]

    for speaker_foci in fociData[language].values():
        for term, foci in speaker_foci.items():
            for focus in foci:
                # A focus is a 'ROW:COLUMN' string.
                parts = focus.split(':')
                column = int(parts[1])
                row = parts[0]

                if column == 0:
                    # NOTE(review): 'A' is counted as white and 'B' as black.
                    # If the rows follow an A-J lightness layout, black would
                    # be row 'J' rather than 'B' -- confirm against chip.txt.
                    if row == 'A':
                        slot = 10   # white_ct
                    elif row == 'B':
                        slot = 11   # black_ct
                    else:
                        continue    # other column-0 rows are not counted
                elif 1 <= column <= 40:
                    # Columns 1-4 -> slot 0 (red), 5-8 -> slot 1, ..., 37-40 -> slot 9.
                    slot = (column - 1) // 4
                else:
                    continue        # columns outside 0-40 are not counted

                # Adding the term itself is enough: any other key with an
                # identical focus list lands in the same division when it is
                # visited, so a reverse lookup by value adds nothing new.
                division_terms[slot].add(term)

    for counts, terms in zip(color_terms, division_terms):
        counts[language] = len(terms)

for name, counts in zip(color_term_names, color_terms):
    print(f"{name}: {counts}")




red_ct: {1: 4, 2: 8, 3: 4, 4: 4, 5: 2, 6: 12, 7: 8, 8: 3, 9: 2, 10: 9, 11: 2, 12: 2, 13: 3, 14: 2, 15: 6, 16: 2, 17: 9, 18: 9, 19: 8, 20: 3, 21: 1, 22: 5, 23: 1, 24: 5, 25: 2, 26: 5, 27: 16, 28: 3, 29: 3, 30: 1, 31: 1, 32: 10, 33: 2, 34: 9, 35: 3, 36: 1, 37: 4, 38: 7, 39: 4, 40: 2, 41: 5, 42: 13, 43: 6, 44: 4, 45: 1, 46: 4, 47: 3, 48: 5, 49: 5, 50: 1, 51: 11, 52: 8, 53: 2, 54: 4, 55: 5, 56: 4, 57: 8, 58: 5, 59: 8, 60: 2, 61: 2, 62: 4, 63: 14, 64: 15, 65: 6, 66: 5, 67: 15, 68: 5, 69: 3, 70: 2, 71: 2, 72: 6, 73: 5, 74: 1, 75: 4, 76: 12, 77: 2, 78: 2, 79: 2, 80: 3, 81: 2, 82: 3, 83: 4, 84: 8, 85: 4, 86: 2, 87: 2, 88: 7, 89: 6, 90: 4, 91: 5, 92: 5, 93: 6, 94: 2, 95: 5, 96: 2, 97: 10, 98: 7, 99: 1, 100: 1, 101: 4, 102: 9, 103: 11, 104: 1, 105: 1, 106: 5, 107: 3, 108: 1, 109: 5, 110: 2}
redyellow_ct: {1: 3, 2: 7, 3: 2, 4: 8, 5: 2, 6: 7, 7: 6, 8: 2, 9: 3, 10: 8, 11: 4, 12: 1, 13: 7, 14: 3, 15: 4, 16: 2, 17: 10, 18: 8, 19: 5, 20: 2, 21: 4, 22: 4, 23: 1, 24: 5, 25: 2, 26: 4, 27: 13, 28: 8, 29: 

In [73]:
# Add up the per-division term counts for each language.
# NOTE: a term counted in several divisions contributes once per division, so
# this sum can exceed the number of distinct terms in the language.

total_terms = {}
for division_counts in color_terms:
    for language, count in division_counts.items():
        total_terms[language] = total_terms.get(language, 0) + count

In [78]:
# Order the (language, summed term count) pairs from the smallest sum to the largest
sorted_total_terms = sorted(total_terms.items(), key=lambda pair: pair[1])

In [79]:
# Report whether a language has a term in any division linked to a colour: 1 if so, 0 otherwise
def check_term(language, colour):
    """Return 1 when ``language`` has a positive term count for ``colour``, else 0.

    Each chromatic colour is checked against its own division and the two
    mixed divisions that carry its name; white and black are checked against
    their single division. A ``colour`` that is not one of the seven names
    below yields 0.

    Args:
        language: Key used to index the module-level ``*_ct`` dictionaries.
        colour: One of 'red', 'yellow', 'green', 'blue', 'purple', 'white',
            'black'.

    Returns:
        1 if any linked division has a count above zero for ``language``,
        otherwise 0.
    """
    linked_divisions = {
        'red': (red_ct, redyellow_ct, redpurple_ct),
        'yellow': (yellow_ct, redyellow_ct, greenyellow_ct),
        'green': (green_ct, greenyellow_ct, bluegreen_ct),
        'blue': (blue_ct, bluegreen_ct, purpleblue_ct),
        'purple': (purple_ct, purpleblue_ct, redpurple_ct),
        'white': (white_ct,),
        'black': (black_ct,),
    }

    # Every linked division is looked up (no short-circuit) so a missing
    # language raises KeyError exactly where a plain sequence of checks would.
    hits = [division[language] > 0 for division in linked_divisions.get(colour, ())]
    return 1 if any(hits) else 0


In [89]:
# For the languages in ascending order of summed terms, record a 0/1 flag per colour
term_colours = ['red', 'yellow', 'green', 'blue', 'purple', 'white', 'black']
binary_table = {
    language: {colour_name: check_term(language, colour_name) for colour_name in term_colours}
    for language, _total in sorted_total_terms
}

# Transpose so that languages are the rows and colours are the columns
binary_table_df = pd.DataFrame(binary_table)
binary_table_df = binary_table_df.T

# NOTE(review): index=False leaves the language numbers out of the CSV, so its
# rows are identifiable only by position -- confirm that this is intended.
binary_table_df.to_csv('sorted_binary_table.csv', index=False)
binary_table_df

Unnamed: 0,red,yellow,green,blue,purple,white,black
105,1,1,0,0,0,1,0
104,1,0,0,1,1,1,0
36,1,1,0,0,1,1,0
74,1,1,1,1,1,1,1
33,1,1,1,0,1,1,0
...,...,...,...,...,...,...,...
10,1,1,1,1,1,1,1
67,1,1,1,1,1,1,1
103,1,1,1,1,1,1,1
32,1,1,1,1,1,1,1


# Data Visualization
Here we are going to look at the different colour groups and how their proportions relate to one another.


# Data Summary

In [88]:
# Build one row per language: the language key, its term count in every colour
# division, and the summed total.
# NOTE: columns are matched by position, which relies on every dictionary
# having been filled in the same language order.

summary_columns = {'language': list(red_ct.keys())}
for column_name, division_counts in zip(color_term_names, color_terms):
    summary_columns[column_name] = list(division_counts.values())
summary_columns['total_terms'] = list(total_terms.values())

df = pd.DataFrame(summary_columns)
df


Unnamed: 0,language,red_ct,redyellow_ct,yellow_ct,greenyellow_ct,green_ct,bluegreen_ct,blue_ct,purpleblue_ct,purple_ct,redpurple_ct,white_ct,black_ct,total_terms
0,1,4,3,3,3,3,2,5,4,4,4,1,1,37
1,2,8,7,6,10,5,4,5,6,5,5,3,0,64
2,3,4,2,1,2,5,4,4,2,1,3,1,0,29
3,4,4,8,4,5,3,6,6,6,5,5,1,1,54
4,5,2,2,3,1,3,1,3,3,2,2,2,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,106,5,4,3,2,1,1,1,3,4,5,1,0,30
106,107,3,4,4,3,2,3,3,4,4,2,2,0,34
107,108,1,2,1,3,1,0,1,2,1,2,1,0,15
108,109,5,8,7,3,4,3,6,4,4,5,2,1,52
