Note: adjust the hard-coded file paths (e.g. `datapath` and the `pd.read_csv` calls) to your local setup before running this notebook.


In [None]:
import numpy as np
import pandas as pd
import os
from IPython.core.debugger import Tracer
import time
import sys

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer

# Directory holding the per-institute survey files (one "<institute>.csv" each).
datapath = "/home/newuser/Dokumente/programming_project/Predictor/data"

# Column names of the standardized output, in the order they are written.
labels = [
    "Datum",
    "CDU", "SPD", "Gruene", "FDP", "Linke", "AfD", "Sonstige",
    "Befragte",
    "Institute",
]
# Polling institutes whose survey files are read.
institutes = ["forsa", "emnid", "allensbach"]




In [None]:
# Collect the latest survey (row 0 of each institute's file) from "forsa",
# "emnid" and "allensbach" and write them to "helper_simple_regression.csv":
# one header row followed by one row per institute.
#
# The header is built with ",".join so it has exactly as many fields as the
# data rows (the previous always-true `i < len(labels)` check left a trailing
# comma, i.e. an extra empty column). The `with` block guarantees the file is
# flushed and closed even if reading one of the institute files fails.

with open("helper_simple_regression.csv", "w") as test_latest:
    test_latest.write(",".join(labels) + "\n")

    for inst in institutes:
        csv_path = os.path.join(datapath, inst + ".csv")
        datafile = pd.read_csv(csv_path, encoding="ISO-8859-1")
        # Every column except the last ("Institute") comes from the data file;
        # the institute name itself fills the last column.
        latest_values = [str(datafile[lab].iloc[0]) for lab in labels[:-1]]
        test_latest.write(",".join(latest_values + [inst]) + "\n")



In [None]:
# Load the helper file generated above into a pandas DataFrame. The csv itself
# is not used further.
# It is read from the same relative path it was written to, so the data loaded
# here is always the file that was just generated, regardless of the working
# directory the notebook runs in.

latest = pd.read_csv("helper_simple_regression.csv", encoding="ISO-8859-1")

In [None]:
def generate_output_csv(modelname):
    """Create "<modelname>.csv" containing only the header row for predictions.

    All further predictions of that model are stored in this file; its
    structure is described in "model_output_structure.txt". An existing file
    with the same name is overwritten.

    Input:
        modelname .... name of the model; the file is named modelname + ".csv"
    """

    filename = modelname + ".csv"

    labels = ["modelname", "prediction_date", "CDU", "SPD", "Gruene", "FDP", "Linke", "AfD", "Sonstige",
              "forsa", "emnid", "allensbach", "startdate", "enddate"]

    # Every label, including the last one, is followed by " , ". The trailing
    # separator is kept on purpose: the rows appended by add_to_output_csv also
    # end with a separator, so header and rows have the same number of fields.
    header = "".join(label + " , " for label in labels) + "\n"

    # The context manager closes the file even if the write fails.
    with open(filename, "w") as out:
        out.write(header)


In [None]:
def add_to_output_csv(modelname, date, data, forsa, emnid, allensbach, start, end):
    """Append one prediction row to the file "<modelname>.csv".

    This function is called after each prediction. If the model has no
    prediction file yet, one is created first with "generate_output_csv".
    The structure of the file is described in "model_output_structure.txt".

    Input:
        modelname .... name of the model; also determines the file name
        date ......... date of the prediction
        data ......... sequence of rows; the first entry of each row is written
                       as one prediction value (e.g. an array of shape (n, 1))
        forsa, emnid, allensbach ... per-institute values written after the
                       predictions
        start, end ... first and last date of the data the prediction used
    """

    filename = modelname + ".csv"

    # Create the file with its header row if this model has no output yet.
    if not os.path.isfile(filename):
        generate_output_csv(modelname)

    fields = [modelname, date]
    fields.extend(row[0] for row in data)
    fields.extend([forsa, emnid, allensbach, start, end])

    # Each field, including the last, is followed by a comma; this matches the
    # trailing separator of the header row.
    line = "".join(str(field) + "," for field in fields) + "\n"

    # Previously the file was opened but never closed; the context manager
    # makes sure the row is flushed to disk and the handle is released.
    with open(filename, "a") as out:
        out.write(line)

In [None]:
def simple_model(dataframe):
    """Predict party shares as a weighted average of the given surveys.

    The weight of each survey is its number of participants (column
    "Befragte"). The result is not returned but appended to the model's
    output file via "add_to_output_csv", with all three institutes flagged 1.

    Input:
        dataframe .... pandas DataFrame with one row per survey. Assumes the
                       column layout of "helper_simple_regression.csv":
                       "Datum", then the seven party columns, then "Befragte".
    """

    n_parties = 7

    # Columns 1..7 hold the party shares (column 0 is "Datum"). The previous
    # slice [1:7] covered only six of them, so the last party ("Sonstige" in
    # the helper file) was always written as 0.0.
    party_labels = dataframe.columns[1:1 + n_parties]

    weights = dataframe["Befragte"]
    total = sum(weights)

    prediction = np.zeros((n_parties, 1))
    for k, party in enumerate(party_labels):
        prediction[k] = sum(dataframe[party] * weights) / total

    # The name of this function doubles as the model name / output file name.
    modelname = sys._getframe().f_code.co_name
    date = time.strftime("%x")  # locale-dependent date representation
    # NOTE(review): min/max compare the "Datum" values as they are stored; if
    # they are date strings, confirm that their format sorts chronologically.
    start = min(dataframe['Datum'])
    end = max(dataframe['Datum'])

    add_to_output_csv(modelname, date, prediction, 1, 1, 1, start, end)


In [None]:
# Run the weighted-average model on the latest surveys; this appends one
# prediction row to "simple_model.csv".
simple_model(latest)

In [None]:
def linear_regression(dataframe, depth, parties):
    """ This model performs linear regression.

    For every party a straight line is fitted through its latest survey
    results and extrapolated by one step to predict the next survey. The
    prediction is appended to the model's output file via
    "add_to_output_csv" (flagged as forsa = 1, emnid = 0, allensbach = 0).

    Note: currently this only makes sense to use for data from one institute,
    else the dates of surveys might not be equally spaced.

    Input:
        dataframe .... pandas DataFrame containing equally spaced survey data,
                       assumed to be ordered newest first (row 0 = latest)
        depth ........ number of data points to be taken into account
        parties....... list of strings containing the parties to be taken into account

    Output:
        predictions... np.array of shape (len(parties), 1) with the prediction
                       for each party
    """

    # Array of survey results for the selected parties, shape = surveys * parties.
    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and taking .values is equivalent and works with old and new versions.
    data_for_regression = dataframe[list(parties)].values
    data_for_regression = data_for_regression[:depth, :]  # choose only the latest datapoints

    prediction = np.zeros((len(parties), 1))

    for p, column in enumerate(data_for_regression.T):
        y = column[~np.isnan(column)]

        # Row 0 is taken to be the latest survey (the [:depth] selection above
        # relies on that), so reverse to chronological order. Extrapolating to
        # x = len(y) then predicts the *next* survey; without the reversal the
        # line was extended one step further into the past.
        y = y[::-1]
        x = np.arange(len(y))

        lm = LinearRegression()
        lm.fit(x.reshape(-1, 1), y)

        # coef_ is an array with one entry per feature; take the single slope.
        prediction[p] = lm.intercept_ + lm.coef_[0] * len(y)

    # Metadata of the prediction does not depend on the party, so it is
    # computed once (and is defined even for an empty party list).
    modelname = sys._getframe().f_code.co_name + " depth =" + str(depth)
    date = time.strftime("%x")  # locale-dependent date representation
    # NOTE(review): start/end span the whole dataframe, not only the `depth`
    # rows used for the fit - confirm this is intended.
    start = min(dataframe['Datum'])
    end = max(dataframe['Datum'])

    add_to_output_csv(modelname, date, prediction, 1, 0, 0, start, end)

    return prediction


In [None]:

# Run the linear regression model on the forsa surveys, using the 20 latest
# data points of all seven parties.
# The file is located via `datapath` and read with the same encoding that is
# used for the institute files when building the helper csv.
data_forsa = pd.read_csv(os.path.join(datapath, "forsa.csv"), encoding="ISO-8859-1")
p = ["CDU", "SPD", "Gruene", "FDP", "Linke", "AfD", "Sonstige"]
depth = 20
linear_regression(data_forsa, depth, p)