# Parsing replies from Component 8

This notebook constists mostly in code adapted from Short_version_C8_Longterm_Tracking_online.ipynb,

and produces a csv file containing wave code, time window, answer key, raw text, lfdn and human labels for all responses in every wave.

In [1]:
import pandas as pd
from pandas.io.stata import StataReader as sr
import re
import matplotlib.pyplot as plt
from datetime import datetime as dt
from glob import glob
import seaborn as sns
import numpy as np

import seaborn as sns
sns.set_context('poster')

# Load data

In [2]:
# Set this to your local VW gdrive folder

local_folder='.'

In [3]:
# Load all datasets/waves as pandas dataframes and  pandas StataReader objects
filenames = glob(local_folder+'/Komponente 8/*.dta')
dataframes = [pd.read_stata(i, convert_categoricals=False) for i in filenames]
labels = [pd.read_stata(i, iterator=True) for i in filenames]

In [4]:
# Print examples of responses for a given wave 'idx'

idx = 17
df = dataframes[idx]
row = df.loc[0]
for k in row.keys():
    print(k.ljust(30)+str(row[k]))

study                         5340
version                       4.0.0 (2013-06-27)
year                          2009
field                         2009-09-29 bis 2009-10-08
ex_field                      2009-10-08 bis 2009-10-25
glescomp                      10
survey                        7
lfdn                          36
datetime                      2009-09-29 12:32:53
lastaccs                      2009-09-29 12:48:21
dauer                         928
status                        2
mut_sex                       0
mut_age                       0
mut_educ                      0
mutation                      0
gew1_t7_v0                    0.449
gew2_t7_v0                    0.237
gew1_t7_v1                    nan
gew2_t7_v1                    nan
gew1_t7_v2                    nan
gew2_t7_v2                    nan
v_01                          1
v_02                          1
v_03                          6
T7_01                         10
a01                           2
a02     

In [5]:
idx = 17
df = dataframes[idx]

keys = [ key for (key, val) in labels[idx].variable_labels().items() if 'Wichtigstes Problem' in val ]
df.filter(keys).head()

Unnamed: 0,T_A04,a03_c2,a04,T_A03,a03,T29_A04,a04c,T28_A03,a03_c3,a03_c1,a03_c4
0,16,10000,5,9,Arbeitslosigkeit,172,5,156,10000,3810,10000
1,0,10000,1000,2,99,0,1000,156,10000,9999,10000
2,8,10000,808,28,Die Arbeislosigkeit,396,809,388,10000,3810,10000
3,2,10000,7,10,Wirtschaftliche Lage,121,7,119,10000,3910,10000
4,0,10000,1000,3,99,0,1000,278,10000,9999,10000


In [6]:
#get labels of variables concerning most important problem
vardict_vls = [pd.read_stata(i, iterator=True) for i in filenames]


for l in labels:
    print(dataframes[labels.index(l)].loc[0, 'study'])
    
    vardict = l.variable_labels()
    for key, val in vardict.items():
       
        if 'Wichtigstes Problem' in val:
            print(key, ': ', val)           
            
    print('\n')

5732
t11a :  Wichtigstes Problem, Loesungskompetenz (Version A)
t10c1 :  Wichtigstes Problem (Codierung 1)
t10c4 :  Wichtigstes Problem (Codierung 4)
t10c2 :  Wichtigstes Problem (Codierung 2)
t11b :  Wichtigstes Problem, Loesungskompetenz (Version B)
t10c6 :  Wichtigstes Problem (Codierung 6)
t10c3 :  Wichtigstes Problem (Codierung 3)
t10c5 :  Wichtigstes Problem (Codierung 5)


5336
t_A03 :  Zeit_Seite_A03 Wichtigstes Problem
a04 :  Wichtigstes Problem, Lösungskompetenz (Version A)
a03_c :  Wichtigstes Problem (Codierung)
a04c :  Wichtigstes Problem, Lösungskompetenz (Version B)
t47_A03 :  Zeit_kumuliert_A03 Wichtigstes Problem
t48_A04 :  Zeit_kumuliert_A04 Wichtigstes Problem, Lösungskompetenz
t_A04 :  Zeit_Seite_A04 Wichtigstes Problem, Lösungskompetenz


5343
T11_A03 :  Zeit_kumuliert_A03 _Wichtigstes Problem
T_A04 :  Zeit_Seite_A04 _Wichtigstes Problem, Loesungskompetenz
E27b :  Wichtigstes Problem, Problemloesung
T_E27b :  Zeit_Seite_E27b _Wichtigstes Problem, Bund, Problemloesu

In [7]:
#store variable names belonging to the first coding of the issue question in a dictionary for each wave
issue_comp8 = {}

for l in labels:
    v_l = l.variable_labels()
    for key, val in v_l.items():
        if val == 'Wichtigstes Problem (1. Codierung)' or val == 'Wichtigstes Problem (Codierung)' or val == 'Wichtigstes Problem: 1. Nennung' or val == 'Wichtigstes Problem: 1. Codierung' or val == 'Wichtigstes Problem (Codierung 1)':
            issue_comp8[labels.index(l)] = key
            
assert len(issue_comp8) == len(filenames)

In [8]:
# correction for wrong end date in first wave
# for-loop as file order can vary by system
for df in dataframes:
    if np.any(df.field=="2009-04-30 bis 2011-05-12"):
        df.field = '2009-04-30 bis 2009-05-12'
        break

In [9]:
#store variable name belonging to all codings of the issue question in a dictionary for each wave
issue_comp8_all_mentions = {}

for l in labels:
    v_l = l.variable_labels()
    issue_comp8_all_mentions[labels.index(l)] = []
    for key, val in v_l.items():
        if val == 'Wichtigstes Problem (1. Codierung)' or val == 'Wichtigstes Problem (2. Codierung)' or val == 'Wichtigstes Problem (3. Codierung)' or val == 'Wichtigstes Problem (4. Codierung)' or val == 'Wichtigstes Problem (5. Codierung)' or val == 'Wichtigstes Problem (6. Codierung)' or val == 'Wichtigstes Problem (Codierung)' or val == 'Wichtigstes Problem: 1. Nennung' or val == 'Wichtigstes Problem: 2. Nennung' or val == 'Wichtigstes Problem: 3. Nennung' or val == 'Wichtigstes Problem: 4. Nennung' or val == 'Wichtigstes Problem: 5. Nennung' or val == 'Wichtigstes Problem: 6. Nennung' or val == 'Wichtigstes Problem: 7. Nennung' or val == 'Wichtigstes Problem: 8. Nennung' or val == 'Wichtigstes Problem: 1. Codierung' or val == 'Wichtigstes Problem: 2. Codierung' or val == 'Wichtigstes Problem: 3. Codierung' or val == 'Wichtigstes Problem: 4. Codierung' or val == 'Wichtigstes Problem: 5. Codierung' or val == 'Wichtigstes Problem: 6. Codierung' or val == 'Wichtigstes Problem: 7. Codierung' or val == 'Wichtigstes Problem: 8. Codierung' or val == 'Wichtigstes Problem: 9. Codierung' or val == 'Wichtigstes Problem: 10. Codierung' or val == 'Wichtigstes Problem: 11. Codierung' or val == 'Wichtigstes Problem: 12. Codierung' or val == 'Wichtigstes Problem (Codierung 1)' or val == 'Wichtigstes Problem (Codierung 2)' or val == 'Wichtigstes Problem (Codierung 3)' or val == 'Wichtigstes Problem (Codierung 4)' or val == 'Wichtigstes Problem (Codierung 5)'or val == 'Wichtigstes Problem (Codierung 6)':
            issue_comp8_all_mentions[labels.index(l)].append(key)
            
assert len(issue_comp8_all_mentions) == len(filenames)

In [10]:
# store the names of the variables concerning the voting districts/federal states for each wave in a dictionary
elecdist_comp8 = {}
ec8 = {}
for l in labels:
    v_l = l.variable_labels()
    for key, val in v_l.items():
        if key == 'elecdist' or key == 'wahlkreis' or key == 'Buland':
            elecdist_comp8[dataframes[labels.index(l)].loc[0, 'study']] = key
            ec8[labels.index(l)] = key          
            
assert len(ec8) == len(filenames) == len(dataframes)

In [11]:
#define a function to upscale the voting districts to the level of federal states
def get_states(row, zeile):
    if zeile == 'wahlkreis':
        if 1 <= row <= 11:
            return 'SH'
        elif 12 <= row <= 17:
            return 'MV'
        elif 18 <= row <= 23:
            return('HH')
        elif 24 <= row <= 53:
            return('NI')
        elif 54 <= row <= 55:
            return('HB')
        elif 56 <= row <= 65:
            return('BB')
        elif 66 <= row <= 74:
            return('ST')
        elif 75 <= row <= 86:
            return('BE')
        elif 87 <= row <= 150:
            return('NW')
        elif 151 <= row <= 166:
            return('SN')
        elif 167 <= row<= 188:
            return('HE')
        elif 189 <= row <= 196:
            return('TH')
        elif 197 <= row <= 211:
            return('RP')
        elif 212 <= row <= 257:
            return('BY')
        elif 258 <= row <= 295:
            return('BW')
        elif 296 <= row <= 299:
            return('SL')
        else:
            return(np.nan)
    
    elif zeile == 'elecdist':
        if 1 <= row <= 11:
            return('SH')
        elif 12 <= row <= 17:
            return('MV')
        elif 18 <= row <= 23:
            return('HH')
        elif 24 <= row <= 53:
            return('NI')
        elif 54 <= row <= 55:
            return('HB')
        elif 56 <= row <= 65:
            return('BB')
        elif 66 <= row <= 74:
            return('ST')
        elif 75 <= row <= 86:
            return('BE')
        elif 87 <= row <= 150:
            return('NW')
        elif 151 <= row <= 166:
            return('SN')
        elif 167 <= row <= 188:
            return('HE')
        elif 189 <= row <= 196:
            return('TH')
        elif 197 <= row <= 211:
            return('RP')
        elif 212 <= row <= 257:
            return('BY')
        elif 258 <= row <= 295:
            return('BW')
        elif 296 <= row <= 299:
            return('SL')
        else:
            return(np.nan)
    
    elif zeile == 'Buland':
        if row == 1:
            return('BW')
        elif row == 2:
            return('BY')
        elif row == 3:
            return('BE')
        elif row == 4:
            return('BB')
        elif row == 5:
            return('HB')
        elif row == 6:
            return('HH')
        elif row == 7:
            return('HE')
        elif row == 8:
            return('MV')
        elif row == 9:
            return('NI')
        elif row == 10:
            return('NW')
        elif row == 11:
            return('RP')
        elif row == 12:
            return('SL')
        elif row == 13:
            return('SN')
        elif row == 14:
            return('ST')
        elif row == 15:
            return('SH')
        elif row == 16:
            return('TH')
        else:
            return(np.nan)
    else:
        return(np.nan)

In [12]:
#join all columns regarding the issue question of all dataframes in one dataframe

issue_df = pd.DataFrame(columns=['field','issue_code', 'elecdist', 'lfdn'])
for key, val in issue_comp8_all_mentions.items():
    for v in val:
        df = dataframes[key].loc[:, ['study', 'field', v, 'lfdn']]
        df['study'] = df['study'].apply( lambda study: 'ZA'+str(int(study)) )         
        df['elecdist'] = dataframes[key].loc[:, ec8[key]]
        df['state'] = dataframes[key].loc[:, ec8[key]].apply(get_states, zeile=ec8[key])
        df.columns = ['wave_code', 'field', 'issue_code','lfdn', 'elecdist', 'state']
        issue_df = issue_df.append(df)
        
issue_df.head()

Unnamed: 0,elecdist,field,issue_code,lfdn,state,wave_code
0,50,2016-06-03 bis 2016-06-17,3753,1,NI,ZA5732
1,267,2016-06-03 bis 2016-06-17,3753,2,BW,ZA5732
2,52,2016-06-03 bis 2016-06-17,2438,3,NI,ZA5732
3,209,2016-06-03 bis 2016-06-17,3700,4,RP,ZA5732
4,168,2016-06-03 bis 2016-06-17,3753,5,HE,ZA5732


In [13]:
#convert code numbers into categories in the way Arnim et al. did it in BTW2013 paper

def get_issue_category_asinbtw13(row, zeile):
    if str(row)[:1] == '1' and len(str(row)) <=4:
        return('Politics')
    elif str(row)[:1] == '2':
        return('Polity')
    elif str(row)[:3] == '431':
        return('Budget and Debt')
    elif str(row)[:3] == '433' :
        return('Currency and Euro')
    elif str(row)[:2] in ['39' , '40']:
        return('Economy')
    elif str(row)[:2] == '41' :
        return('Education')
    elif str(row)[:2] == '36' :
        return('Environment')
    elif str(row)[:3] == '371' :
        return('Family Policy')
    elif str(row)[:3] in ['310','310', '312', '316', '317', '318', '329'  '330', '331', '332', '333', '339']:
         return('Foreign Policy (Defense)')    
    elif str(row)[:3] == '311' :
        return('Foreign Policy (Europe)')    
    elif str(row)[:3]  in ['430' ,'439']:
        return('General Fiscal Policy')    
    elif str(row)[:3]  in ['370', '372', '373', '378', '379'] :
        return('General Social Policy')    
    elif str(row)[:3]  in ['374' ,'376','377'] :
        return('Health Care and Pensions')      
    elif str(row)[:2] == '35' :
        return('Infrastructure')    
    elif str(row)[:2] == '38' :
        return('Labor Market')    
    elif str(row)[:2] == '34' :
        return('Law and Order')    
    elif str(row)[:3] == '375' :
        return('Migration and Integration')     
    elif str(row)[:3] == '432' :
        return('Taxes') 
    else:
        return(np.nan)

In [14]:
issue_df['issue_code'] = issue_df['issue_code'].apply(get_issue_category_asinbtw13, zeile='issue_code')

issue_df['issue_code'] = issue_df['issue_code'].str.lower()

pd.to_numeric(issue_df.issue_code.value_counts().index, errors='coerce').unique().sort_values()#[issue_df.issue_code.value_counts()]

issue_df['field'] = issue_df.field.str.replace('bis', 'to')

In [15]:
issue_df['year'] = issue_df.field.map(lambda x: re.match('\d\d\d\d', x).group(0))
issue_df['start_of_wave'] = issue_df.field.map(lambda x: re.match('\d\d\d\d\-\d\d\-\d\d', x).group(0))
issue_df['start_of_wave'] = pd.to_datetime(issue_df['start_of_wave'])

In [16]:
issue_df.issue_code.unique()

array(['migration and integration', 'polity', 'general social policy',
       'infrastructure', 'politics', 'education', 'law and order',
       'foreign policy (europe)', nan, 'labor market',
       'health care and pensions', 'economy', 'foreign policy (defense)',
       'budget and debt', 'currency and euro', 'general fiscal policy',
       'environment', 'taxes', 'family policy'], dtype=object)

In [17]:
issue_df.to_csv('data/C8_issue_df.csv',index=False)
issue_df.head()

Unnamed: 0,elecdist,field,issue_code,lfdn,state,wave_code,year,start_of_wave
0,50,2016-06-03 to 2016-06-17,migration and integration,1,NI,ZA5732,2016,2016-06-03
1,267,2016-06-03 to 2016-06-17,migration and integration,2,BW,ZA5732,2016,2016-06-03
2,52,2016-06-03 to 2016-06-17,polity,3,NI,ZA5732,2016,2016-06-03
3,209,2016-06-03 to 2016-06-17,general social policy,4,RP,ZA5732,2016,2016-06-03
4,168,2016-06-03 to 2016-06-17,migration and integration,5,HE,ZA5732,2016,2016-06-03


# Now merge with raw text

In [18]:
missing_raw_text_filenames = glob(local_folder+'/issue_item_raw_text_if_missing_in_dataset/*.csv')
missing_raw_text_filecodes = [ fn.split('/')[-1][:6] for fn in missing_raw_text_filenames ]

missing_raw_text_filenames

['./issue_item_raw_text_if_missing_in_dataset/ZA6815_v1-0-0_open-ended.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA5734_open-ended_v1-0-0.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA5732_open-ended_v1-0-0.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA6817_v1-0-0_open-ended.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA6816_v1-0-0_open-ended.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA5733_open-ended_v1-0-0.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA5731_open-ended_v1-1-0.csv',
 './issue_item_raw_text_if_missing_in_dataset/ZA5720_v3-0-0_open-ended.csv']

In [19]:
missing_raw_text = { filecode:pd.read_csv(filename, sep=';', encoding='latin1') \
                     for filecode,filename in zip(missing_raw_text_filecodes, missing_raw_text_filenames) }

missing_raw_text[missing_raw_text_filecodes[0]].head()

Unnamed: 0,lfdn,t10s,t12s
0,935,Vertrauensverlust und Desinteresse,Soziale Gerechtigkeit
1,346,Schulbildung,Integration und Zuwanderung
2,747,Sozialer zusammenhalt,Popolismus
3,525,Stabilisierung Europas,Flüchtlingspolitik
4,443,Flüchtlingspolitik,"Rechtsruck in der Politik ""AfD-Wähler"""


In [20]:
# See which waves have a field called 'Wichtigstes Problem'

out_strs = []
for filename,l in zip(filenames,labels):
    out_str = dataframes[labels.index(l)].loc[0, 'field'].replace('bis','to')+'-'+filename[15:]
    
    vardict = l.variable_labels()
    for key, val in vardict.items():
        if 'Wichtigstes Problem' == val:
            out_str += ' - '+str(key)+': '+str(val) 
    if filename.split('/')[-1][:6] in missing_raw_text_filecodes:
        out_str += ' - '+'text in '+filename.split('/')[-1][:6]+'.csv'
    out_strs += [out_str]

out_strs = sorted(out_strs)
out_strs

['2009-04-30 to 2009-05-12-ZA5334_v3-0-0.dta',
 '2009-05-27 to 2009-06-05-ZA5335_v4-0-0.dta',
 '2009-07-03 to 2009-07-13-ZA5336_v4-0-0.dta',
 '2009-07-31 to 2009-08-11-ZA5337_v3-0-0.dta',
 '2009-08-24 to 2009-09-01-ZA5338_v3-1-0.dta',
 '2009-09-18 to 2009-09-27-ZA5339_v3-0-0.dta - a03s: Wichtigstes Problem',
 '2009-09-29 to 2009-10-08-ZA5340_v4-0-0.dta - a03: Wichtigstes Problem',
 '2009-12-10 to 2009-12-20-ZA5341_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2010-04-15 to 2010-04-23-ZA5342_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2010-06-24 to 2010-07-05-ZA5343_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2010-09-16 to 2010-09-26-ZA5344_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2010-12-09 to 2010-12-19-ZA5345_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2011-03-09 to 2011-03-19-ZA5346_v2-1-0.dta - A03s: Wichtigstes Problem',
 '2011-05-23 to 2011-06-03-ZA5347_v2-0-0.dta - A03s: Wichtigstes Problem',
 '2011-08-24 to 2011-09-03-ZA5348_v2-0-0.dta - a03s: Wichtigstes Problem',
 '2011-12-08 to 

In [21]:
# Show labels of variables concerning most important problem for wave 'idx'
vardict_vls = [pd.read_stata(i, iterator=True) for i in filenames]

l = labels[idx]
print(dataframes[labels.index(l)].loc[0, 'study'])

vardict = l.variable_labels()
for key, val in vardict.items():
    if 'Wichtigstes Problem' in val:
        print(key, ': ', val)           
        
keys = [ key for (key, val) in labels[idx].variable_labels().items() if 'Wichtigstes Problem' in val ]

dataframes[labels.index(l)].filter(keys).head()

5340
T_A04 :  Zeit_Seite_A04_Loesungskompetenz: Wichtigstes Problem
a03_c2 :  Wichtigstes Problem (2. Codierung)
a04 :  Loesungskompetenz: Wichtigstes Problem (Version A)
T_A03 :  Zeit_Seite_A03_Wichtigstes Problem
a03 :  Wichtigstes Problem
T29_A04 :  Zeit_kumuliert_A04_Loesungskompetenz: Wichtigstes Problem
a04c :  Loesungskompetenz: Wichtigstes Problem (Version B)
T28_A03 :  Zeit_kumuliert_A03_Wichtigstes Problem
a03_c3 :  Wichtigstes Problem (3. Codierung)
a03_c1 :  Wichtigstes Problem (1. Codierung)
a03_c4 :  Wichtigstes Problem (4. Codierung)


Unnamed: 0,T_A04,a03_c2,a04,T_A03,a03,T29_A04,a04c,T28_A03,a03_c3,a03_c1,a03_c4
0,16,10000,5,9,Arbeitslosigkeit,172,5,156,10000,3810,10000
1,0,10000,1000,2,99,0,1000,156,10000,9999,10000
2,8,10000,808,28,Die Arbeislosigkeit,396,809,388,10000,3810,10000
3,2,10000,7,10,Wirtschaftliche Lage,121,7,119,10000,3910,10000
4,0,10000,1000,3,99,0,1000,278,10000,9999,10000


## Get all text entries of every wave

In [22]:
raw_texts = []

for filename,l in zip(filenames,labels):
    time_window = dataframes[labels.index(l)].loc[0, 'field'].replace('bis','to')
    fn = filename[15:21]
    out_str = time_window+'-'+fn
    
    vardict = l.variable_labels()
    for key, val in vardict.items():
        if 'Wichtigstes Problem' == val:
            out_str += ' - '+str(key)+': '+str(val)
            print('Processing: '+out_str)
            
            df = dataframes[labels.index(l)].filter(['lfdn',key])#.set_index('lfdn')
            raw_texts += [( time_window, fn, key, df, False )]
            
    if filename.split('/')[-1][:6] in missing_raw_text_filecodes:
        out_str += ' - '+'text in '+filename.split('/')[-1][:6]+'.csv'
        print('Processing: '+out_str)
        
        df = missing_raw_text[fn].filter(['lfdn','t10s'])#.set_index('lfdn')
        raw_texts += [( time_window, fn, key, df, True )]

raw_texts.sort()

Processing: 2016-06-03 to 2016-06-17-ZA5732 - text in ZA5732.csv
Processing: 2010-06-24 to 2010-07-05-ZA5343 - A03s: Wichtigstes Problem
Processing: 2017-03-17 to 2017-03-31-ZA6815 - text in ZA6815.csv
Processing: 2012-05-02 to 2012-05-15-ZA5350 - t10s: Wichtigstes Problem
Processing: 2013-09-06 to 2013-09-21-ZA5721 - t10s: Wichtigstes Problem
Processing: 2015-09-11 to 2015-09-25-ZA5729 - t10s: Wichtigstes Problem
Processing: 2015-02-27 to 2015-03-13-ZA5727 - t10s: Wichtigstes Problem
Processing: 2010-12-09 to 2010-12-19-ZA5345 - A03s: Wichtigstes Problem
Processing: 2016-12-02 to 2016-12-16-ZA5734 - text in ZA5734.csv
Processing: 2016-02-26 to 2016-03-11-ZA5731 - text in ZA5731.csv
Processing: 2017-09-12 to 2017-09-23-ZA6817 - text in ZA6817.csv
Processing: 2010-09-16 to 2010-09-26-ZA5344 - A03s: Wichtigstes Problem
Processing: 2009-12-10 to 2009-12-20-ZA5341 - A03s: Wichtigstes Problem
Processing: 2013-11-29 to 2013-12-13-ZA5722 - t10s: Wichtigstes Problem
Processing: 2009-09-29 to 2

In [23]:
# Make single dataset with all responses
df_all_raw_texts = pd.DataFrame()
for tup in raw_texts:
    time_window, fn, key, df, miss = tup
    df.columns.values[1] = 'text'
    df['time_window'] = time_window
    df['wave_code']   = fn
    df['answer_key']  = key
    # df['originally_missing_in_dataset'] = miss # Uncomment if it's useful to know which file it came from
    df_all_raw_texts = df_all_raw_texts.append(df, ignore_index=True)

# Combine that with the labels given to each answer

In [24]:
wc1 = set(df_all_raw_texts.wave_code.values)
wc2 = set(issue_df.wave_code.values)

print("We have the labels but not the text:")
print(wc2.difference(wc1),'\n')

print("We have the text but not the labels:")
print(wc1.difference(wc2))

We have the labels but not the text:
{'ZA5335', 'ZA5336', 'ZA5338', 'ZA5337', 'ZA5334'} 

We have the text but not the labels:
set()


In [None]:
all_codes = []
for index, row in df_all_raw_texts.iterrows():
    if index % 500 == 0:
        print(index,'done')
    lfdn = row.lfdn
    wc = row.wave_code
    df = issue_df[ (issue_df.wave_code==wc) & (issue_df.lfdn==lfdn)  ]
    codes = df.issue_code.unique()
    all_codes += [ list(codes) ]

In [26]:
df_all_raw_texts['tags'] = all_codes
df_all_raw_texts['tags'] = df_all_raw_texts['tags'].apply( lambda x: '/'.join([ i for i in x if type(i)==str ]) )
df_all_raw_texts['tags'] = df_all_raw_texts['tags'].apply( lambda x: np.nan if x=='' else x )

df_all_raw_texts.to_csv('data/C8_all_raw_texts_and_labels.csv', sep='\t', encoding='utf-8')

The reason some entries have many tags comes from waves with multiple label entries, such as:
- 'A03_c1' - 'Wichtigstes Problem: 1. Codierung'
- 'A03_c2' - 'Wichtigstes Problem: 2. Codierung'
- ...
- 'A03_c8' - 'Wichtigstes Problem: 8. Codierung'

In [28]:
df_all_raw_texts.head()

Unnamed: 0,lfdn,text,time_window,wave_code,answer_key,tags
0,177.0,99,2009-09-18 to 2009-09-27,ZA5339,a03s,
1,178.0,Die arbeitslosigkeit.,2009-09-18 to 2009-09-27,ZA5339,a03s,labor market
2,179.0,ka,2009-09-18 to 2009-09-27,ZA5339,a03s,
3,180.0,Dass die Finanzen geordnet werden und keine Ne...,2009-09-18 to 2009-09-27,ZA5339,a03s,budget and debt
4,181.0,Schulpolitik sollte besser sein,2009-09-18 to 2009-09-27,ZA5339,a03s,education
