problems found debugged (#11)

* problems found debugged * accuracy improved * Update data_sort.py
findexchange · Oct 14, 2016 · 1fdd27a · 1fdd27a
1 parent 9dc6cc0
commit 1fdd27a
Showing 1 changed file with 7 additions and 7 deletions.
diff --git a/src/data_sort.py b/src/data_sort.py
@@ -51,6 +51,7 @@ def lower_clean_suffix(strings):
 	@staticmethod		
 	def replace_no_char(strings):
 		strings = strings.replace('- ', '')
+		strings = strings.replace(' -','')
 		strings = strings.replace('&', '')
 		strings = strings.replace('(', '')
 		strings = strings.replace(')',"")
@@ -88,7 +89,7 @@ def get_clean_names(self):
 		dataframe['name'] = dataframe['name'].apply(self.lower_clean_suffix)
 		dataframe['name'] = dataframe['name'].apply(self.replace_no_char)
 		dataframe['name'] = dataframe['name'].apply(self.add_break)
-		return dataframe['name'].reset_index().drop('index', axis = 1)
+		return dataframe['name']
 
 
 	#break down all the names into a list
@@ -146,22 +147,21 @@ def place_nan(string):
 		if len(string) == 0:
 			string = np.nan
 		return string
-
 
 	#put it all together: return a dataframe with all keywords that have a frequency larger than 2
 	#They are treated as tags
 	def aggreate_all(self, n = 2):
-		name_df = self.get_clean_names()
+		name_series = self.get_clean_names()
 		address_series = self.get_clean_address()
-		if len(address_series)==0: pass
+		if len(address_series) == 0: pass
 		#Delete the names containing postal address
 		else:
 			for i in address_series:
 				check_word = i
-				if np.any(name_df[name_df.name.str.contains(check_word)]['name'] != pd.Series.empty):
-					name_df.loc[name_df.name.str.contains(check_word), 'name'] = name_df.loc[name_df.name.str.contains(check_word), 'name'].str.replace(check_word, '')
+				if np.any(name_series[name_series.str.contains(check_word)] != pd.Series.empty): 
+					name_series.loc[name_series.str.contains(check_word)] = name_series.loc[name_series.str.contains(check_word)].str.replace(check_word, '')
 
-		one_word = self.one_word_list(name_df.name)
+		one_word = self.one_word_list(name_series)
 		unigrams = self.delete_nonsense(Counter(self.get_ngram(one_word,1)))
 		bigrams = self.delete_nonsense(Counter(self.get_ngram(one_word,2)))
 		trigrams = self.delete_nonsense(Counter(self.get_ngram(one_word, 3)))