dedupeio · RobKraft · Mar 4, 2021 · Mar 5, 2021 · Mar 7, 2021 · Mar 7, 2021
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,20 @@ ENV
 distpgsql_init_db.py
 pgsql_example/pgsql_init_db.py
 .idea
+# Visual Studio per-user state (whole directory, not individual files)
+/.vs/
+/dedupe-examples.pyproj.user
+# Local input and working data files
+/s3_csv_example/combinedfile.csv
+/s3_csv_example/s3thirdfile.csv
+/s3_csv_example/s3secondfile.csv
+/s3_csv_example/Mappings.csv
+/combinedfile.csv
+/CBKC01.ResponseExportComeBackKC1.csv
+/CBKC01.Agency2.csv
+/pre_Op1.csv
+/logfile.csv
+/errors.csv
+# Timestamped run outputs: matched by pattern so that future runs, whose
+# file names carry a different timestamp, are ignored as well
+/*Duplicate_Vaccination_Signups.*.csv
+/s3_csv_example_output.*
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,20 @@
+# Image that runs KCDigitalDrive_Vacc.py with its mapping, settings and training files.
+FROM python:3
+
+# Install dependencies first, in a single layer, so that editing the script
+# below does not invalidate the cached pip layer; --no-cache-dir keeps pip's
+# download cache out of the image.
+RUN pip install --no-cache-dir unidecode pandas dedupe boto3
+
+# COPY (not ADD): these are plain local files, so ADD's archive-extraction and
+# URL handling is not wanted.
+COPY KCDigitalDrive/KCDigitalDrive_Vacc.py /
+COPY KCDigitalDrive/Mappings.csv ./
+# NOTE(review): SAFE01TestData.csv is not added by this change set; confirm it
+# exists in the build context, otherwise the build fails at this step.
+COPY SAFE01TestData.csv ./
+COPY KCDigitalDrive/Duplicate_Vaccination_Signups_learned_settings ./
+COPY KCDigitalDrive/Duplicate_Vaccination_Signups_training.json ./
+
+# Default command and arguments; supply a full command to `docker run` to override.
+CMD ["python", "./KCDigitalDrive_Vacc.py", "wildrydes-rob-kraft", "pp4ncaliftwo", "local"]
diff --git a/KCDigitalDrive/Duplicate_Vaccination_Signups_learned_settings b/KCDigitalDrive/Duplicate_Vaccination_Signups_learned_settings
diff --git a/KCDigitalDrive/Duplicate_Vaccination_Signups_training.json b/KCDigitalDrive/Duplicate_Vaccination_Signups_training.json
diff --git a/KCDigitalDrive/KCDigitalDrive_Vacc.py b/KCDigitalDrive/KCDigitalDrive_Vacc.py
diff --git a/KCDigitalDrive/Mappings.csv b/KCDigitalDrive/Mappings.csv
@@ -0,0 +1,4 @@
+Source,FirstName,LastName,Email,City,Phone,Zip,Unique ID,Key
+comebackkc1,First Name,Last Name,Email address,City,"Phone number (please enter numbers only, no dashes, spaces, or parentheses)",What is your zip code?,Response Reference ID,Receipt Number
+CBKC01,First Name,Last Name,Email address,City,"Phone number (please enter numbers only, no dashes, spaces, or parentheses)",What is your zip code?,Response Reference ID,Receipt Number
+SAFE01,First Name,Last Name,Email Address,"What city do you live in?","Home Phone Number","The zip code where you live",IP,Submission ID
diff --git a/KCDigitalDrive/requirements.txt b/KCDigitalDrive/requirements.txt
@@ -0,0 +1 @@
+unidecode
diff --git a/csv_example/csv_evaluation.py b/csv_example/csv_evaluation.py
@@ -1,5 +1,6 @@
 from future.utils import viewitems
 
+import os
 import csv
 import collections
 import itertools
@@ -41,6 +42,10 @@ def dupePairs(filename, rowname) :
 manual_clusters = 'csv_example_input_with_true_ids.csv'
 dedupe_clusters = 'csv_example_output.csv'
 
+scriptpath = os.path.dirname(__file__)
+manual_clusters = os.path.join(scriptpath, manual_clusters)
+dedupe_clusters = os.path.join(scriptpath, dedupe_clusters)
+
 true_dupes = dupePairs(manual_clusters, 'True Id')
 test_dupes = dupePairs(dedupe_clusters, 'Cluster ID')
 

diff --git a/csv_example/csv_example.py b/csv_example/csv_example.py
@@ -84,6 +84,12 @@ def readData(filename):
     settings_file = 'csv_example_learned_settings'
     training_file = 'csv_example_training.json'
 
+    scriptpath = os.path.dirname(__file__)
+    input_file = os.path.join(scriptpath, input_file)
+    output_file = os.path.join(scriptpath, output_file)
+    settings_file = os.path.join(scriptpath, settings_file)
+    training_file = os.path.join(scriptpath, training_file)
+
     print('importing data ...')
     data_d = readData(input_file)
 

diff --git a/csv_example/csv_example_learned_settings b/csv_example/csv_example_learned_settings
diff --git a/csv_example/csv_example_output.csv b/csv_example/csv_example_output.csv
diff --git a/csv_example/csv_example_training.json b/csv_example/csv_example_training.json
diff --git a/dedupe-examples.pyproj b/dedupe-examples.pyproj
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" DefaultTargets="Build">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{d588098a-74ed-4cc7-a86f-26ff7f60504a}</ProjectGuid>
+    <ProjectHome />
+    <StartupFile>KCDigitalDrive\KCDigitalDrive_Vacc.py</StartupFile>
+    <SearchPath />
+    <WorkingDirectory>.</WorkingDirectory>
+    <OutputPath>.</OutputPath>
+    <ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
+    <LaunchProvider>Standard Python launcher</LaunchProvider>
+    <InterpreterId />
+    <CommandLineArguments>wildrydes-rob-kraft pp4ncaliftwo local</CommandLineArguments>
+    <EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
+    <SuppressEnvironmentCreationPrompt>True</SuppressEnvironmentCreationPrompt>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)' == 'Debug'" />
+  <PropertyGroup Condition="'$(Configuration)' == 'Release'" />
+  <PropertyGroup>
+    <VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <Content Include="csv_example\requirements.txt" />
+    <Content Include="Dockerfile" />
+    <Content Include="extended-variables\requirements.txt" />
+    <Content Include="gazetteer_example\requirements-1.x.txt" />
+    <Content Include="gazetteer_example\requirements-2.x.txt" />
+    <Content Include="KCDigitalDrive\combinedfile.csv" />
+    <Content Include="KCDigitalDrive\Duplicate_Vaccination_Signups_learned_settings" />
+    <Content Include="KCDigitalDrive\Duplicate_Vaccination_Signups_training.json" />
+    <Content Include="KCDigitalDrive\Mappings.csv" />
+    <Content Include="KCDigitalDrive\original.reals3_csv_example_learned_settings" />
+    <Content Include="KCDigitalDrive\original.reals3_csv_example_training.json" />
+    <Content Include="KCDigitalDrive\requirements.txt" />
+    <Content Include="mysql_example\requirements.txt" />
+    <Content Include="pgsql_big_dedupe_example\requirements.txt" />
+    <Content Include="record_linkage_example\requirements.txt" />
+    <Content Include="requirements.txt" />
+    <Content Include="s3_csv_example\requirements.txt" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="csv_example\csv_evaluation.py" />
+    <Compile Include="csv_example\csv_example.py" />
+    <Compile Include="extended-variables\officers.py" />
+    <Compile Include="gazetteer_example\gazetteer_evaluation.py" />
+    <Compile Include="gazetteer_example\gazetteer_example.py" />
+    <Compile Include="gazetteer_example\gazetteer_postgres_example.py" />
+    <Compile Include="KCDigitalDrive\KCDigitalDrive_Vacc.py" />
+    <Compile Include="mysql_example\mysql_example.py" />
+    <Compile Include="mysql_example\mysql_init_db.py" />
+    <Compile Include="patent_example\patent_evaluation.py" />
+    <Compile Include="patent_example\patent_example.py" />
+    <Compile Include="pgsql_big_dedupe_example\pgsql_big_dedupe_example.py" />
+    <Compile Include="pgsql_big_dedupe_example\pgsql_big_dedupe_example_init_db.py" />
+    <Compile Include="record_linkage_example\record_linkage_example.py" />
+    <Compile Include="record_linkage_example\record_linkage_example_evaluation.py" />
+    <Compile Include="s3_csv_example\s3_csv_example.py" />
+  </ItemGroup>
+  <ItemGroup>
+    <Folder Include="csv_example" />
+    <Folder Include="extended-variables" />
+    <Folder Include="gazetteer_example" />
+    <Folder Include="KCDigitalDrive\" />
+    <Folder Include="mysql_example" />
+    <Folder Include="s3_csv_example\" />
+    <Folder Include="patent_example" />
+    <Folder Include="pgsql_big_dedupe_example" />
+    <Folder Include="record_linkage_example" />
+  </ItemGroup>
+  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
+</Project>
diff --git a/dedupe-examples.sln b/dedupe-examples.sln
@@ -0,0 +1,23 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.31025.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dedupe-examples", "dedupe-examples.pyproj", "{D588098A-74ED-4CC7-A86F-26FF7F60504A}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{D588098A-74ED-4CC7-A86F-26FF7F60504A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{D588098A-74ED-4CC7-A86F-26FF7F60504A}.Release|Any CPU.ActiveCfg = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {F7D1E09B-B615-4EAE-AC36-A75F5CECA2B4}
+	EndGlobalSection
+EndGlobal
diff --git a/extended-variables/officers.py b/extended-variables/officers.py
@@ -84,6 +84,12 @@ def readData(filename):
     settings_file = 'officers_settings'
     training_file = 'officers_training.json'
 
+    scriptpath = os.path.dirname(__file__)
+    input_file = os.path.join(scriptpath, input_file)
+    output_file = os.path.join(scriptpath, output_file)
+    settings_file = os.path.join(scriptpath, settings_file)
+    training_file = os.path.join(scriptpath, training_file)
+
     print('importing data ...')
     data_d = readData(input_file)
 

diff --git a/gazetteer_example/gazetteer_evaluation.py b/gazetteer_example/gazetteer_evaluation.py
@@ -43,6 +43,13 @@ def linkPairs(filename, rowname) :
 
 clusters = 'gazetteer_output.csv'
 
+# Look for the results file next to this script rather than in the current
+# working directory.
+import os  # NOTE(review): move to the top-of-file imports if not already there
+
+scriptpath = os.path.dirname(__file__)
+clusters = os.path.join(scriptpath, clusters)
+
 true_dupes = linkPairs(clusters, 'unique_id')
 test_dupes = linkPairs(clusters, 'Cluster ID')
 

diff --git a/gazetteer_example/gazetteer_example.py b/gazetteer_example/gazetteer_example.py
@@ -84,6 +84,12 @@ def readData(filename):
     settings_file = 'gazetteer_learned_settings'
     training_file = 'gazetteer_training.json'
 
+    # Anchor every path, inputs included, to this script's directory so the
+    # example behaves the same from any working directory.
+    scriptpath = os.path.dirname(__file__)
+    output_file = os.path.join(scriptpath, output_file)
+    settings_file = os.path.join(scriptpath, settings_file)
+    training_file = os.path.join(scriptpath, training_file)
+
-    canon_file = os.path.join('data', 'AbtBuy_Buy.csv')
-    messy_file = os.path.join('data', 'AbtBuy_Abt.csv')
+    canon_file = os.path.join(scriptpath, 'data', 'AbtBuy_Buy.csv')
+    messy_file = os.path.join(scriptpath, 'data', 'AbtBuy_Abt.csv')
 

diff --git a/gazetteer_example/gazetteer_postgres_example.py b/gazetteer_example/gazetteer_postgres_example.py
@@ -172,6 +172,10 @@ def descriptions(datasets):
     canon_file = os.path.join('data', 'AbtBuy_Buy.csv')
     messy_file = os.path.join('data', 'AbtBuy_Abt.csv')
 
+    scriptpath = os.path.dirname(__file__)
+    canon_file = os.path.join(scriptpath, canon_file)
+    messy_file = os.path.join(scriptpath, messy_file)
+
     print('Importing raw data into the database')
     canonical = readData(canon_file)
     messy = readData(messy_file)

diff --git a/patent_example/patent_evaluation.py b/patent_example/patent_evaluation.py
@@ -1,6 +1,7 @@
 import csv
 import collections
 import itertools
+import os
 
 def evaluateDuplicates(found_dupes, true_dupes):
     true_positives = found_dupes.intersection(true_dupes)
@@ -41,6 +42,10 @@ def dupePairs(filename, colname) :
 dedupe_clusters = 'patstat_output.csv'
 manual_clusters = 'patstat_reference.csv'
 
+scriptpath = os.path.dirname(__file__)
+dedupe_clusters = os.path.join(scriptpath, dedupe_clusters)
+manual_clusters = os.path.join(scriptpath, manual_clusters)
+
 test_dupes = dupePairs(dedupe_clusters, 'Cluster ID')
 true_dupes = dupePairs(manual_clusters, 'leuven_id')
 

diff --git a/patent_example/patent_example.py b/patent_example/patent_example.py
@@ -87,6 +87,12 @@ def names(data):
     settings_file = 'patstat_settings.json'
     training_file = 'patstat_training.json'
 
+    scriptpath = os.path.dirname(__file__)
+    input_file = os.path.join(scriptpath, input_file)
+    output_file = os.path.join(scriptpath, output_file)
+    settings_file = os.path.join(scriptpath, settings_file)
+    training_file = os.path.join(scriptpath, training_file)
+
     print('importing data ...')
     data_d = readData(input_file)
 

diff --git a/record_linkage_example/record_linkage_example.py b/record_linkage_example/record_linkage_example.py
@@ -85,6 +85,13 @@ def readData(filename):
     left_file = 'AbtBuy_Abt.csv'
     right_file = 'AbtBuy_Buy.csv'
 
+    scriptpath = os.path.dirname(__file__)
+    output_file = os.path.join(scriptpath, output_file)
+    settings_file = os.path.join(scriptpath, settings_file)
+    training_file = os.path.join(scriptpath, training_file)
+    left_file = os.path.join(scriptpath, left_file)
+    right_file = os.path.join(scriptpath, right_file)
+
     print('importing data ...')
     data_1 = readData(left_file)
     data_2 = readData(right_file)

diff --git a/record_linkage_example/record_linkage_example_evaluation.py b/record_linkage_example/record_linkage_example_evaluation.py
@@ -41,6 +41,13 @@ def linkPairs(filename, rowname) :
 
 clusters = 'data_matching_output.csv'
 
+# Look for the results file next to this script rather than in the current
+# working directory.
+import os  # NOTE(review): move to the top-of-file imports if not already there
+
+scriptpath = os.path.dirname(__file__)
+clusters = os.path.join(scriptpath, clusters)
+
 true_dupes = linkPairs(clusters, 'unique_id')
 test_dupes = linkPairs(clusters, 'Cluster ID')
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 dedupe
 Unidecode==0.4.16
 future
+pandas
diff --git a/s3_csv_example/requirements.txt b/s3_csv_example/requirements.txt
@@ -0,0 +1 @@
+unidecode