In [2]:
## Self-Organizing Maps by Paras Chopra
## www.paraschopra.com
## paras1987@gmail.com
##
## Please give credit if you use my work.

from random import *
from math import *
import numpy as np
import csv 

class Node:
    """One unit of the map: a feature vector, a prediction vector and a grid position.

    FV holds FV_size values and PV holds PV_size values, each produced by a
    call to random(). X and Y store the position passed in by the caller.
    """

    def __init__(self, FV_size=10, PV_size=10, Y=0, X=0):
        # Position of the unit on the map
        self.X = X
        self.Y = Y

        self.FV_size = FV_size
        self.PV_size = PV_size

        # The feature vector is drawn before the prediction vector, so a
        # seeded generator always yields the same weights.
        self.FV = [random() for _ in range(FV_size)]  # Feature Vector
        self.PV = [random() for _ in range(PV_size)]  # Prediction Vector


class SOM:
    """Self-organizing map built from height * width Node units.

    Units are kept row-major in self.nodes: the unit created for row i and
    column j is Node(FV_size, PV_size, i, j) and sits at index i * width + j.
    """

    # Leave radius=False if you want the radius to be calculated automatically
    def __init__(
        self,
        height=10,
        width=10,
        FV_size=10,
        PV_size=10,
        radius=False,
        learning_rate=0.005,
    ):
        self.height = height
        self.width = width
        # Any falsy radius (False, 0, None) selects the automatic value.
        self.radius = radius if radius else (height + width) / 2
        self.total = height * width
        self.learning_rate = learning_rate
        self.FV_size = FV_size
        self.PV_size = PV_size
        self.nodes = [
            Node(FV_size, PV_size, i, j)
            for i in range(height)
            for j in range(width)
        ]

    # Train_vector format: [ [FV[0], PV[0]],
    #                        [FV[1], PV[1]], so on..

    def train(self, iterations=1000, train_vector=(((0.0,), (0.0,)),)):
        """Fit the map to train_vector for the given number of iterations.

        train_vector is a sequence of [FV, PV] pairs; FV must hold at least
        FV_size values and PV at least PV_size values. In iteration i the
        radius and the learning rate are both scaled by
        exp(-i / time_constant), with time_constant = iterations / log(radius).
        The iteration counter is printed as progress, 50 numbers per line.
        """
        log_radius = log(self.radius)
        # log(radius) <= 0 (radius <= 1, e.g. the automatic radius of a 1x1
        # map) used to divide by zero or make the radius grow; an infinite
        # time constant keeps radius and learning rate fixed instead.
        if log_radius > 0:
            time_constant = iterations / log_radius
        else:
            time_constant = float("inf")

        for i in range(1, iterations + 1):
            decay = exp(-1.0 * i / time_constant)
            radius_decaying = self.radius * decay
            learning_rate_decaying = self.learning_rate * decay
            print(i, end=", ")
            if i % 50 == 0:
                print("")

            for sample in train_vector:
                input_FV = sample[0]
                input_PV = sample[1]
                best_node = self.nodes[self.best_match(input_FV)]

                # Collect the new vectors first and write them back once the
                # whole neighbourhood has been computed.
                pending = []
                for node in self.nodes:
                    grid_dist = self.distance(best_node, node)
                    if grid_dist < radius_decaying:
                        # NOTE(review): the usual Gaussian neighbourhood
                        # divides by 2 * radius ** 2; the original
                        # 2 * radius * i denominator is kept here so training
                        # results do not change - confirm the intent.
                        influence = exp(
                            (-1.0 * (grid_dist ** 2)) / (2 * radius_decaying * i)
                        )
                        step = influence * learning_rate_decaying

                        # Learning: move each weight towards the input
                        new_FV = [
                            node.FV[l] + step * (input_FV[l] - node.FV[l])
                            for l in range(self.FV_size)
                        ]
                        new_PV = [
                            node.PV[l] + step * (input_PV[l] - node.PV[l])
                            for l in range(self.PV_size)
                        ]
                        pending.append((node, new_FV, new_PV))

                for node, new_FV, new_PV in pending:
                    node.FV[:] = new_FV
                    node.PV[:] = new_PV

    # Returns prediction vector
    def predict(self, FV=(0.0,), get_ij=False):
        """Return the PV of the unit closest to FV.

        With get_ij=True the result is the tuple (PV, X, Y) of that unit.
        """
        node = self.nodes[self.best_match(FV)]
        if get_ij:
            return node.PV, node.X, node.Y
        return node.PV

    # Returns best matching unit's index
    def best_match(self, target_FV=(0.0,)):
        """Return the index of the unit whose FV is closest to target_FV.

        Ties go to the lowest index. The search starts from infinity, so the
        nearest unit is found even when every distance exceeds
        sqrt(FV_size), i.e. for inputs that are not scaled to [0, 1].
        """
        minimum = float("inf")  # Minimum distance
        minimum_index = 0  # Minimum distance unit
        for index, node in enumerate(self.nodes):
            candidate = self.FV_distance(node.FV, target_FV)
            if candidate < minimum:
                minimum = candidate
                minimum_index = index

        return minimum_index

    def FV_distance(self, FV_1=(0.0,), FV_2=(0.0,)):
        """Return the Euclidean distance over the first FV_size components."""
        squared = 0.0
        for j in range(self.FV_size):
            squared = squared + (FV_1[j] - FV_2[j]) ** 2

        return sqrt(squared)

    def distance(self, node1, node2):
        """Return the Euclidean distance between two units on the grid."""
        return sqrt((node1.X - node2.X) ** 2 + (node1.Y - node2.Y) ** 2)

#Receives the countries info (x, y, and country tag)
def printCountries(countries, height=20, width=20):
    """Print a height x width grid with each country tag at its position.

    countries maps (x, y) tuples to a tag; x selects the row and y the
    column of the printed grid, and both must be valid indices into it.
    Cells without a country show " X ". Every row is printed as a list.
    The defaults reproduce the original fixed 20 x 20 grid.
    """
    #Creating a matrix of height per width placeholder elements
    grid = [[" X "] * width for _ in range(height)]

    #In position x, y we replace the placeholder by the country tag
    for (x, y), tag in countries.items():
        grid[x][y] = tag

    #Printing the result
    for row in grid:
        print(f"{row}")

    
#The training vector contains the following information:
#Access to electricity (% of population)
#Surface area (sq. km)
#Scientific and technical journal articles
#Rural population
#Population, total
#Population, male
#Population, female
#Military expenditure (% of GDP)
#Imports of goods and services (% of GDP)
#Armed forces personnel, total

import csv 

#Number of indicators per country (see the list above); the CSV is assumed
#to hold that many consecutive rows per country, always in the same order
numIndicators = 10

#indicators[k] collects the k-th indicator of every country
indicators = [[] for _ in range(numIndicators)]

contryTrainingVector = []
setCountryTag = set()
countryTag = []

#We use the csv library to read the data from the World Bank.
#newline='' is how the csv module asks files to be opened.
with open('data.csv', mode='r', newline='') as csvFile:
    #DictReader consumes the header line itself, so every row seen in the
    #loop is a data row and row 0 holds the first indicator
    csvReader = csv.DictReader(csvFile)
    for rowIndex, row in enumerate(csvReader):
        #We get the value of the current indicator
        currValue = row["2015 [YR2015]"]
        #We get the country code of the current country
        currCountryCode = row["Country Code"]
        #Rows with an empty or missing value carry no data and are skipped;
        #they still count as a row when picking the indicator category
        if currValue:
            #Indicators are grouped by category with modulo numIndicators.
            #The CSV yields strings, that's why we cast to float.
            indicators[rowIndex % numIndicators].append(float(currValue))
            #Every row contains the country code, so a set is used to
            #register each tag only once
            if currCountryCode not in setCountryTag:
                setCountryTag.add(currCountryCode)
                countryTag.append(currCountryCode)

#The number of countries comes from the data instead of a fixed 40
numCountries = len(countryTag)
for i in range(numIndicators):
    if len(indicators[i]) != numCountries:
        raise ValueError(
            f"indicator {i} has {len(indicators[i])} values, "
            f"expected one per country ({numCountries})"
        )

#We normalize the data: every value is divided by the maximum of its
#indicator category (non-negative data ends up between 0 and 1)
for i in range(numIndicators):
    maxValue = max(indicators[i])
    indicators[i] = [value / maxValue for value in indicators[i]]

#Now we build the training vector:
#the j-th indicator of the i-th country is indicators[j][i]
for i in range(numCountries):
    row = [indicators[j][i] for j in range(numIndicators)]
    #To add elements to our training vector we must add an "id"
    #So the structure is [indicatorsVector, [id]]
    contryTrainingVector.append([row, [i]])

print("Initialization...")
a = SOM(20, 20, numIndicators, 1, False, 0.03)

print("Training...")
a.train(500, contryTrainingVector)

positions = {}

#We look up the BMU (Best Matching Unit) of every country
for tag, sample in zip(countryTag, contryTrainingVector):
    #predict returns the unit's value and its position
    value, x, y = a.predict(sample[0], True)
    #Several countries can share a unit; the last one wins
    positions[(x, y)] = tag

#We print the number of distinct positions that we got
print(f"He wave {len(positions)} entries.")
#We print a matrix with the countries
printCountries(positions)


Initialization...
Training...
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 
151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 
201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

In [27]:
import csv 

#Number of indicators per country; the CSV is assumed to hold that many
#consecutive rows per country, always in the same order
numIndicators = 10

#indicators[k] collects the k-th indicator of every country
indicators = [[] for _ in range(numIndicators)]

contryTrainingVector = []
setCountryTag = set()
countryTag = []

#newline='' is how the csv module asks files to be opened
with open('data.csv', mode='r', newline='') as csvFile:
    #DictReader consumes the header line itself, so row 0 is the first data row
    csvReader = csv.DictReader(csvFile)
    for rowIndex, row in enumerate(csvReader):
        currValue = row["2015 [YR2015]"]
        currCountryCode = row["Country Code"]
        #Rows with an empty or missing value are skipped, but they still
        #count as a row when picking the indicator category
        if currValue:
            indicators[rowIndex % numIndicators].append(float(currValue))
            if currCountryCode not in setCountryTag:
                setCountryTag.add(currCountryCode)
                countryTag.append(currCountryCode)

#The number of countries comes from the data instead of a fixed 40
numCountries = len(countryTag)
for i in range(numIndicators):
    if len(indicators[i]) != numCountries:
        raise ValueError(
            f"indicator {i} has {len(indicators[i])} values, "
            f"expected one per country ({numCountries})"
        )

#Every value is divided by the maximum of its indicator category
for i in range(numIndicators):
    maxValue = max(indicators[i])
    indicators[i] = [value / maxValue for value in indicators[i]]

#One [indicatorsVector, [0]] entry per country
for i in range(numCountries):
    row = [indicators[j][i] for j in range(numIndicators)]
    contryTrainingVector.append([row, [0]])

print(f"contryTrainingVector = {contryTrainingVector}")
print(f"countryTag = {countryTag}")


contryTrainingVector = [[[0.998116836547852, 0.16261313292295995, 0.019465642472762494, 0.006006171163250178, 0.031455175682968455, 0.029844338862892294, 0.033155854778470266, 0.1543155446956389, 0.07797223042578637, 0.03709110094970102], [0]], [[0.91522822, 0.06425101984121181, 0.00020191219359179255, 0.005630344109215299, 0.007927051822464667, 0.007758251071918063, 0.008105267209760104, 0.31654207745118285, 0.24531958785620767, 0.025043967639817095], [0]], [[0.99710902, 0.4980492155629962, 0.12340914203620469, 0.047683824246398145, 0.14911666909759191, 0.14293663143680244, 0.15564139011720565, 0.2478690019043054, 0.09301575046591117, 0.2565951459725642], [0]], [[1.0, 0.5778222917550042, 0.14077861407070308, 0.010965519264481773, 0.026037330260643806, 0.02513963095779101, 0.026985097544940252, 0.20924008263317787, 0.22712072315169946, 0.023742525501231095], [0]], [[0.99714844, 0.044220665857617004, 0.014041083712651274, 0.003722309781054483, 0.013104648415279824, 0.012559875983152913,