Skip to content

Commit

Permalink
problems found debugged (#11)
Browse files Browse the repository at this point in the history
* problems found debugged

* accuracy improved

* Update data_sort.py
  • Loading branch information
XiaoPa authored and maZahaca committed Oct 14, 2016
1 parent 9dc6cc0 commit 1fdd27a
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions src/data_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def lower_clean_suffix(strings):
@staticmethod
def replace_no_char(strings):
strings = strings.replace('- ', '')
strings = strings.replace(' -','')
strings = strings.replace('&', '')
strings = strings.replace('(', '')
strings = strings.replace(')',"")
Expand Down Expand Up @@ -88,7 +89,7 @@ def get_clean_names(self):
dataframe['name'] = dataframe['name'].apply(self.lower_clean_suffix)
dataframe['name'] = dataframe['name'].apply(self.replace_no_char)
dataframe['name'] = dataframe['name'].apply(self.add_break)
return dataframe['name'].reset_index().drop('index', axis = 1)
return dataframe['name']


#break down all the names into a list
Expand Down Expand Up @@ -146,22 +147,21 @@ def place_nan(string):
if len(string) == 0:
string = np.nan
return string


#put it all together, return a dataframe with all keywords that have a frequency larger than 2
#They are treated as tags
def aggreate_all(self, n = 2):
name_df = self.get_clean_names()
name_series = self.get_clean_names()
address_series = self.get_clean_address()
if len(address_series)==0: pass
if len(address_series) == 0: pass
#Remove the postal address text from the names that contain it
else:
for i in address_series:
check_word = i
if np.any(name_df[name_df.name.str.contains(check_word)]['name'] != pd.Series.empty):
name_df.loc[name_df.name.str.contains(check_word), 'name'] = name_df.loc[name_df.name.str.contains(check_word), 'name'].str.replace(check_word, '')
if np.any(name_series[name_series.str.contains(check_word)] != pd.Series.empty):
name_series.loc[name_series.str.contains(check_word)] = name_series.loc[name_series.str.contains(check_word)].str.replace(check_word, '')

one_word = self.one_word_list(name_df.name)
one_word = self.one_word_list(name_series)
unigrams = self.delete_nonsense(Counter(self.get_ngram(one_word,1)))
bigrams = self.delete_nonsense(Counter(self.get_ngram(one_word,2)))
trigrams = self.delete_nonsense(Counter(self.get_ngram(one_word, 3)))
Expand Down

0 comments on commit 1fdd27a

Please sign in to comment.