In [32]:
import pandas as pd
from tqdm.notebook import tqdm

from transformers import BertModel, BertTokenizer
import torch
import glob, os, re

from parallel_pandas import ParallelPandas
from functools import partial

import helpers

In [2]:
# Worker count for parallel-pandas. The original value of 23 is kept as an upper
# bound, but never exceeds the number of cores this machine reports, so the
# notebook does not oversubscribe smaller machines. Falls back to 23 when the
# core count cannot be determined.
N_CPU = min(23, os.cpu_count() or 23)
# split_factor controls how finely each Series/DataFrame is chunked across workers.
ParallelPandas.initialize(n_cpu=N_CPU, split_factor=4)

## Load in transcripts

In [3]:
# Transcript CSVs whose names start with a digit. glob yields matches in an
# arbitrary, filesystem-dependent order, so sort them to make the row order of
# the concatenated data (and of the exported CSV) reproducible across runs.
files = sorted(glob.glob("../raw/[0-9]*.csv"))

In [4]:
# Read every transcript CSV and stack them row-wise into a single frame.
# Each file is first read with pandas' default encoding; only a decoding
# failure triggers the windows-1252 retry. Any other problem (missing file,
# malformed CSV, ...) now surfaces with its original error instead of being
# masked by a second read attempt.
dfs = []
for fi in tqdm(files):
    try:
        df = pd.read_csv(fi, index_col=0)
    except UnicodeDecodeError:
        df = pd.read_csv(fi, encoding="windows-1252", index_col=0)
    dfs.append(df)
# NOTE(review): each file keeps the index read from its first column, so index
# labels presumably repeat across files -- confirm before relying on label-based
# alignment downstream.
full_data = pd.concat(dfs, axis=0)

  0%|          | 0/701 [00:00<?, ?it/s]

In [5]:
# Preview the first five rows of the combined transcripts.
full_data.head(n=5)

Unnamed: 0,LineNo,Interviewer,Statement,Question,Answer,Vol,Page,Area,Witness,Location
0,1291,The Chairman.,,What is your occupation ?,"My son is a crofter, and I am in my son's plac...",1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
1,1292,The Chairman.,,How far back with the previous landlord?,Thirty-four years ago.,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
2,1293,Mr Fraser-Mackintosh.,,We will take the present landlord?,The present landlord raised our rent 31s. and ...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
3,1294,The Chairman.,,Will you state the amount of stock which your ...,In summer and autumn we could keep two cows an...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
4,1295,Sheriff Nicolson.,,When were the sheep taken from you?,The landlord took the sheep stock from us a fe...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"


In [6]:
# Load the community-worded place-identity items and discard rows with any
# missing value.
# NOTE: the following cell rebinds `place_constructs` to the land-worded items.
place_constructs = pd.read_csv("../data/place_identity_community.csv").dropna()
place_constructs

Unnamed: 0,question_text,type
0,I am very attached to this community,identity
1,This community is very special to me,identity
2,I identify strongly with this community,identity
3,I feel this community is a part of me,identity
4,This community means a lot to me,identity
5,Living in this community says a lot about who ...,identity
6,I identify strongly with this place,identity
7,The town is like a part of myself,identity
8,This factor is part of my identity,identity
9,My bonds to this part of town are strong,identity


In [7]:

# Load the land-worded place-identity items and discard rows with any missing
# value. This rebinds `place_constructs`, replacing whatever it held before.
place_constructs = pd.read_csv("../data/place_identity_land.csv").dropna()
place_constructs

Unnamed: 0,question_text,type
0,I am very attached to this land,identity
1,This land is very special to me,identity
2,I identify strongly with this land,identity
3,I feel this land is a part of me,identity
4,This land means a lot to me,identity
5,Living in this land says a lot about who I am,identity
6,I identify strongly with this land,identity
7,The land is like a part of myself,identity
8,This land is part of my identity,identity
9,My bonds to this part of land are strong,identity


In [8]:
# Build one row per (Area, Location, Witness): every non-missing answer in the
# group is joined into a single newline-separated string stored in `text`.
# NOTE: the following cell rebinds `full_text` to the row-level answers.
full_text = (
    full_data
    .dropna(subset=["Answer"])
    .groupby(["Area", "Location", "Witness"])["Answer"]
    .apply(lambda answers: "\n".join(answers))
    .reset_index(name="text")
)
full_text.head()

Unnamed: 0,Area,Location,Witness,text
0,"Argyll, Bunessan, Mull",Ardalanish,Alexander Mcintyre,For the last 300 years.\nNo; they were born an...
1,"Argyll, Bunessan, Mull",Ardtun,"Duncan Mclean, Lachlan Macdonald",No.\nThe only thing we have to say is that we ...
2,"Argyll, Bunessan, Mull",Bunessan,"Alexander Mckechnie, MB, CM (Glasgow)",Over six years.\nNo; I was in Inverness.\nI ha...
3,"Argyll, Bunessan, Mull",Bunessan,Neil Matheson,"I have been a fisherman from my youth, and th..."
4,"Argyll, Bunessan, Mull",Catchean,"John Mccormick, John Mckinnon",Nine.\n. There are some of them sub-tenants.\n...


In [9]:
# Keep the transcripts at one row per answer, dropping only the rows whose
# `Answer` is missing. This rebinds `full_text`.
full_text = full_data.dropna(axis=0, how="any", subset=["Answer"])
full_text.head(n=5)

Unnamed: 0,LineNo,Interviewer,Statement,Question,Answer,Vol,Page,Area,Witness,Location
0,1291,The Chairman.,,What is your occupation ?,"My son is a crofter, and I am in my son's plac...",1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
1,1292,The Chairman.,,How far back with the previous landlord?,Thirty-four years ago.,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
2,1293,Mr Fraser-Mackintosh.,,We will take the present landlord?,The present landlord raised our rent 31s. and ...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
3,1294,The Chairman.,,Will you state the amount of stock which your ...,In summer and autumn we could keep two cows an...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"
4,1295,Sheriff Nicolson.,,When were the sheep taken from you?,The landlord took the sheep stock from us a fe...,1.0,73.0,"Skye, Skeabost",Hugh Mcnab,"Kildonan, Lynedale"


In [29]:
# Register tqdm's pandas integration.
# NOTE(review): this does not appear to affect `p_apply` -- the progress bar
# rendered for this cell carries a different label -- confirm whether it is needed.
tqdm.pandas(desc="Cosine sim")

# Apply the helper to every answer in parallel via parallel-pandas' `p_apply`.
# Presumably yields a similarity-based place-identity score per answer; see
# `helpers.compute_place_identity_measure` for the actual return shape.
cosine_sims = full_text.loc[:, "Answer"].p_apply(
    helpers.compute_place_identity_measure
)

COMPUTE_PLACE_IDENTITY_MEASURE DONE:   0%|          | 0/43678 [00:00<?, ?it/s]

In [30]:
# Place the computed measure(s) alongside the answer rows and export the result
# without the index column.
pd.concat(
    [full_text, cosine_sims],
    axis="columns",
).to_csv("cosine_sim.csv", index=False)