In [39]:
import csv
import gc
import re
from collections import Counter

import numpy as np

In [40]:
def read_data(filename,yes_no_array):
    '''Load the cleaned data set from a ';'-delimited csv file made by the write_data function.
    The order of the columns is the following:
    Title | Category of the participant | Questions | Output
    Columns whose index is in yes_no_array are parsed with int(), every other
    column except the last one is parsed with read_dictionary, and the last
    column is kept as the raw string. Blank lines in the file are skipped.
    INPUT: name of the file, positions of the integer columns
    OUTPUT: list of rows, one list of parsed values per non-empty csv row'''
    data = []
    with open(filename, newline='', encoding='utf-8') as f:
        for row in csv.reader(f, delimiter=';'):
            # csv.reader yields [] for a blank line: there is no output column to keep
            if not row:
                continue
            # Split off the output column without mutating the row itself
            *fields, output = row
            element = [
                int(field) if i in yes_no_array else read_dictionary(field)
                for i, field in enumerate(fields)
            ]
            element.append(output)
            data.append(element)
    #  call the garbage collector
    gc.collect()
    return data

def read_word_count(filename):
    '''Load one merged word-count dictionary per row of a ';'-delimited csv file.
    Every cell of a row is parsed with read_dictionary and the resulting
    key/count pairs are merged into a single dictionary for that row (a key
    seen again in a later cell overwrites the earlier value).
    INPUT: name of the file to load
    OUTPUT: list of dictionaries, one per csv row'''
    def merge_cells(cells):
        # Fold the dictionaries of all the cells of one row together
        merged = {}
        for cell in cells:
            merged.update(read_dictionary(cell))
        return merged

    with open(filename, newline='',encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=';')
        words = [merge_cells(cells) for cells in rows]
        #  call the garbage collector
        gc.collect()
    return words

def read_dictionary(dictionary_string):
    '''Rebuild a dictionary from its printed form.
    Spaces, square brackets, parentheses, braces, single quotes and '|' are
    stripped, the remainder is split on ',', '|' and ':' and the resulting
    tokens are paired up as key, integer value. A trailing token without a
    partner is ignored.
    INPUT: a string of many key-value tuples
    OUTPUT: a dictionary mapping each key (string) to its value (integer)'''
    stripped = re.sub(r'[ \[ | \] | \( | \) | \' | { | } ]','', dictionary_string)
    tokens = re.split(r'[,|:]', stripped)
    # Even positions hold the keys, odd positions the matching values
    keys = tokens[0::2]
    values = tokens[1::2]
    return {key: int(value) for key, value in zip(keys, values)}

def sort_dictionary(dictionary):
    '''Rank the entries of a dictionary by value, largest first.
    INPUT: a dictionary whose values can be compared with each other
    OUTPUT: list of (key, value) tuples in descending order of value; entries
    with equal values keep the relative order they have in the dictionary'''
    ranked = list(dictionary.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def normalize_counts(counts):
    '''Convert absolute token counts into relative frequencies.
    INPUT: a dictionary mapping each token to its number of occurrences
    OUTPUT: a dictionary mapping each token to its count divided by the sum
    of all the counts'''
    denominator = sum(counts.values())
    frequencies = {}
    for token, occurrences in counts.items():
        frequencies[token] = float(occurrences) / denominator
    return frequencies

In [41]:
def find_zip_codes_by_town(density_threshold,filename='city_information.tsv'):
    '''Split the zip codes of a tab-separated file into two sets according to a density threshold.
    The first line of the file is skipped as a header. For every other row the
    density is column 3 divided by column 4 and the zip code is read from
    column 1. Rows with fewer than five columns, or with an empty column 3 or
    4, are ignored. A hyphenated zip code field is split on '-' and each part
    is stored as its own code.
    INPUT: density threshold, file name of the reference library
    OUTPUT: set of zip codes for cities (density >= threshold), set of zip
    codes for villages (density < threshold)
    RAISES: ValueError if column 3 or 4 is not a number, ZeroDivisionError if
    column 4 is zero'''
    codes_cities = set()
    codes_villages = set()
    with open(filename, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header line; the default keeps an empty file from raising
        next(reader, None)
        for row in reader:
            # Without both values there is no density, so the row is left out
            # of both sets (no numeric sentinel that a threshold could match)
            if len(row) < 5 or row[3] == '' or row[4] == '':
                continue
            density = float(row[3]) / float(row[4])
            if density >= density_threshold:
                target = codes_cities
            else:
                target = codes_villages
            # split() returns the code itself when it contains no hyphen
            target.update(row[1].split('-'))
    #  call the garbage collector
    gc.collect()
    return codes_cities,codes_villages

def city_village_classifier(density_threshold,data):
    '''Label the entries of a data set using the zip code stored in their last field.
    Entries whose zip code is in the city set returned by
    find_zip_codes_by_town get the label 1, entries whose zip code is in the
    village set get the label -1, and entries found in neither set are
    dropped. The zip code column is removed and the label column is appended
    in its place.
    INPUT: the density threshold, a dataset
    OUTPUT: numpy array of the kept entries with the label as last column'''
    city_zip_codes, village_zip_codes = find_zip_codes_by_town(density_threshold)
    kept_entries = []
    labels = []
    for entry in data:
        zip_code = entry[-1]
        if zip_code in city_zip_codes:
            label = 1
        elif zip_code in village_zip_codes:
            label = -1
        else:
            # Unknown zip code: the entry is not part of the result
            continue
        kept_entries.append(entry)
        labels.append(label)
    features = np.array(kept_entries)
    # Drop the zip code column before attaching the labels
    last_column = len(features[0]) - 1
    features = np.delete(features, last_column, 1)
    label_column = np.array(labels).reshape((len(labels), 1))
    labelled = np.append(features, label_column, axis=1)
    #  call the garbage collector
    gc.collect()
    return labelled

In [42]:
def word_count_by_question(data,yes_no_questions):
    '''Sum the word counts of a data set question by question (column by column).
    The last column of every entry is skipped, as are the columns whose index
    is in yes_no_questions. For each remaining column the values of all the
    entries are added together with Counter addition, which keeps only the
    words whose summed count is positive.
    INPUT:  dataset, the indexes of the integer columns
    OUTPUT: a list of Counters, one per counted column, in column order
    (an empty list when the dataset is empty)
    '''
    # len() instead of truthiness so that a numpy array is accepted as well
    if len(data) == 0:
        return []
    text_columns = [i for i in range(len(data[0])-1) if i not in yes_no_questions]
    word_count = []
    for i in text_columns:
        question_count = Counter()
        # Field i of every entry is read directly, so no separate column
        # helper is needed
        for entry in data:
            question_count += Counter(entry[i])
        word_count.append(question_count)
    #  call the garbage collector
    gc.collect()
    return word_count

def word_count_total(dictionary_array):
    '''Add up the word counts of several texts, each represented by a dictionary.
    INPUT: an array of dictionaries mapping a word to its count
    OUTPUT: a Counter with the summed count of every word (Counter addition
    keeps only the words whose summed count is positive)
    '''
    combined = Counter()
    for text_counts in dictionary_array:
        combined += Counter(text_counts)
    return combined



In [43]:
def get_most_used_words(data, yes_no_array, word_count_array, number_of_words):
    '''Replace every word dictionary of an entry by the counts of the most used words of its question.
    The words of each dictionary of word_count_array are ranked with
    sort_dictionary and at most number_of_words of them are kept, so every
    entry gets a vector of the same length for a given question. A question
    with fewer distinct words than number_of_words simply yields a shorter
    vector. The dictionaries of word_count_array are matched, in order, with
    the columns of an entry that are not listed in yes_no_array.
    INPUT: a data set, the indexes of the columns whose answer is an integer,
    an array with the total word counts of each question, the maximum number
    of words to keep
    OUTPUT: the filtered dataset as a list of lists: integer columns are
    copied, dictionary columns become lists of counts (0 for a word the entry
    does not contain) and the last column is copied unchanged
    '''
    # A negative limit keeps no word at all instead of slicing from the end
    limit = max(number_of_words, 0)
    # Slicing, unlike indexing, tolerates questions with fewer words than the limit
    most_used_words = [
        [word for word, _ in sort_dictionary(question)[:limit]]
        for question in word_count_array
    ]

    filtered_data = []
    for entry in data:
        entry_array = []
        j = 0  # position of the current dictionary column inside most_used_words
        for i in range(len(entry)-1):
            if i in yes_no_array:
                entry_array.append(entry[i])
            else:
                answer = entry[i]
                entry_array.append([answer.get(word, 0) for word in most_used_words[j]])
                j += 1
        entry_array.append(entry[-1])
        filtered_data.append(entry_array)

    #  call the garbage collector
    gc.collect()

    return filtered_data

def get_set_features(data,column):
    '''Return one feature (column) of a dataset as a numpy array.
    INPUT: a dataset (sequence of rows), the index of the column to extract
    OUTPUT: numpy array built from the chosen field of every row; when that
    field is a list of the same length in every row the result is 2D
    '''
    # Pick the field row by row: converting the whole dataset with np.array
    # first fails on recent NumPy versions when rows mix scalars and lists
    features = [np.asarray(entry[column]) for entry in data]
    return np.array(features)

def get_total_most_used_words(data, yes_no_array, word_count_array, number_of_words,by_question=True):
    '''Describe every entry by the counts of the overall most used words.
    The words of word_count_array (a single dictionary of total counts) are
    ranked with sort_dictionary and at most number_of_words of them are kept;
    a dictionary with fewer words simply yields a shorter vector. For each
    entry the word counts of all its dictionary columns are summed with
    word_count_by_question and word_count_total, and the counts of the kept
    words are read from that sum.
    INPUT: a data set, the indexes of the columns whose answer is an integer,
    a dictionary with the total number of words, the maximum number of words
    to keep, by_question (accepted but not used by this function)
    OUTPUT: the filtered dataset as a list of [list of counts, last field of
    the entry] pairs
    '''
    # A negative limit keeps no word at all instead of slicing from the end
    limit = max(number_of_words, 0)
    # Slicing, unlike indexing, tolerates a dictionary with fewer words than the limit
    most_used_words = [word for word, _ in sort_dictionary(word_count_array)[:limit]]

    filtered_data = []
    for entry in data:
        # Sum the words of every dictionary column of this single entry
        entry_words = word_count_total(word_count_by_question([entry],yes_no_array))
        words_rep_array = [entry_words.get(word, 0) for word in most_used_words]
        filtered_data.append([words_rep_array, entry[-1]])

    #  call the garbage collector
    gc.collect()

    return filtered_data

In [44]:
# Load the rows of data.csv: columns 1, 4, 6, 10 and 12 are parsed as integers,
# the other columns (except the last one) as word-count dictionaries.
data = read_data('data.csv',[1,4,6,10,12])
# Keep the rows whose last field is a known zip code and replace that field by
# the label 1 (density >= 20344) or -1 (density below the threshold).
classified_data = city_village_classifier(20344,data)


In [45]:
# Each call returns a list with one merged word -> count dictionary per csv row.
g = read_word_count('word_count_by_question.csv')
h = read_word_count('word_count_total.csv')

In [46]:
# Replace every dictionary column by the counts of the 5 highest-ranked words
# of the matching dictionary in g; the integer columns are copied unchanged.
filtered_data = get_most_used_words(classified_data,[1,4,6,10,12],g,5)

In [47]:
# Stack column 2 of every filtered row into one array (shown as the cell output).
get_set_features(filtered_data,2)

array([[0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 1, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0]])