In [5]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
import nltk.corpus
import numpy as np
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
plt.rcParams["figure.figsize"] = [10, 8]

In [3]:
def tokenize_text(doc):
    """Combine the strings in the "response" column of dataframe df into one long string. Then, tokenize the
    string and make all words lowercase."""

    # Tokenize and make lowercase.
    words = nltk.word_tokenize(doc)
    words = [w.lower() for w in words]
    
    return words


def wordnet_pos(tag):
    """Map a Brown POS tag to a WordNet POS tag."""
    
    table = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    
    # Default to a noun.
    return table.get(tag[0], wordnet.NOUN)


def lemmatize_text(words):
    """Lemmatize words to get the base words. The input 'words' is a list of of words."""
    
    lemmatizer = nltk.WordNetLemmatizer()
    word_tags = nltk.pos_tag(words)
    words = [lemmatizer.lemmatize(w, wordnet_pos(t)) for (w, t) in word_tags]
    
    return words


def remove_stopwords(words):
    """Remove stopwords from a string."""
    
    stopwords = nltk.corpus.stopwords.words("english")
    words = [w for w in words if w not in stopwords]
    
    return words

def clean_text(doc):    
    """Tokenize, lemmatize, and remove stopwords for the text of all articles."""
    
    words = re.sub("< ?/?[a-z]+ ?>|\n", "", doc)
    words = tokenize_text(words)
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    doc = [w for w in words if w.isalnum()]
    doc = ' '.join(doc)
    
    return doc

def clean_df(df):
    """Combine the title and content of each post into one string and clean each string."""
    text = df['title'] + " " + df['content']
    df_clean = pd.DataFrame([clean_text(i) for i in text])
    df_clean.columns = ["text"]
    #df_clean["tags"] = df["tags"]
    df_clean = pd.concat([df_clean, pd.DataFrame(df["tags"])],axis = 1, sort = False)
    return df_clean

In [3]:
## Functions from examine_tags.ipynb
def get_top_tags(df):
    tag_list = [tags.split(' ') for tags in df['tags']]
    flat_list = [item for sublist in tag_list for item in sublist]
    fq = nltk.FreqDist(w for w in flat_list)
    df_fq = pd.DataFrame.from_dict(fq, orient="index").reset_index()
    df_95 = df_fq[df_fq.iloc[:,1] >= np.percentile(df_fq.iloc[:,1],95)].reset_index(drop=True)
    df_95.columns = ["term", "fq"]
    return df_95

def subset_top_df(df, top_tags_df):
    """
    df: DataFrame with all posts
    top_tags_df: Data frame of top tags
    """
    tags_list = [tags.split(' ') for tags in df['tags']]
    indeces = [i for i in range(len(tags_list))
               if list(set(top_tags_df['term']) & 
                       set(tags_list[i])) != []]
    return df.loc[indeces]

In [4]:
def trim_tags_clean(df):
    top = get_top_tags(df)
    top_subset = subset_top_df(df, top).reset_index()
    return clean_df(top_subset)

In [None]:
# Stack exchange topic names
names = ["biology","cooking","crypto","diy","robotics","travel"]

def get_paths(name):
    """Get path names for each file."""
    path = "data/"+name+".csv"
    return path

# Get path names
paths = [get_paths(i) for i in names]

# All data frames in a list.
dfs = [pd.read_csv(i) for i in paths]

# Get a list of the cleaned data frames.
trim_clean_dfs = [trim_tags_clean(i) for i in dfs]

# Save cleaned dfs as csv
for i in range(len(names)):
    trim_clean_dfs[i].to_pickle(names[i]+"_trim_clean.dat")

In [6]:
data = pd.read_pickle("cleaned_trim/diy_trim_clean.dat")

In [7]:
dfT = data["tags"].str.split(" ", expand = True)
dfT

Unnamed: 0,0,1,2,3,4
0,remodeling,basement,carpentry,,
1,caulking,bathroom,,,
2,drywall,,,,
3,walls,load-bearing,structural,,
4,repair,electrical,,,
5,drywall,wallpaper,,,
6,windows,,,,
7,concrete,,,,
8,repair,radiator,pipe,,
9,electrical,fire-hazard,knob-and-tube,,


In [8]:
tags = data["tags"]
tags = list(set(" ".join(tags).split(" ")))

In [9]:
newDF = pd.DataFrame(columns=tags)

In [None]:
for x in range(dfT.count()[0]):
    someDict={}
    for d in tags:
        rowdata=list(dfT.iloc[x])
        if d in rowdata:
            someDict[d]=1
        else:
            someDict[d]=0
    newDF=newDF.append(someDict,ignore_index=True)

In [18]:
ohc = dfT.stack().str.get_dummies().sum(level=0)

In [19]:
ohc.head()

Unnamed: 0,120-240v,240v,abs,access-panel,accessibility,acid,acoustic,acrylic,addition,adhesive,...,wiring,wood,wood-filler,wood-finish,wood-finishing,wooden-furniture,woodstove,woodworking,workshop,yurt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [183]:
np.where(ohc.iloc[1,:] == 1)
ohc.columns[44]
ohc.columns[77]

(array([44, 77]),)

In [20]:
dat_ohc = pd.concat([data['text'],ohc], axis = 1, sort = False)

In [21]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dat_ohc, random_state=42, test_size=0.30, shuffle=True)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

train_text = train['text']
test_text = test['text']

vectorizer.fit(train_text)
vectorizer.fit(test_text)

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['text'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['text'], axis=1)

In [23]:
split_list = [tags.split(" ") for tags in list(data['tags'])]
split = [item for sublist in split_list for item in sublist]
unique = list(set(split))
categories = unique

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing steel comments...**
Test accuracy is 0.9978323699421965


**Processing caulk comments...**
Test accuracy is 0.9985549132947977


**Processing pest-control comments...**
Test accuracy is 0.9981936416184971


**Processing winter comments...**
Test accuracy is 0.9996387283236994


**Processing gutters comments...**
Test accuracy is 0.9987355491329479


**Processing melamine comments...**
Test accuracy is 1.0


**Processing frame comments...**
Test accuracy is 0.9989161849710982


**Processing boiler comments...**
Test accuracy is 0.994400289017341


**Processing measuring comments...**
Test accuracy is 0.9987355491329479


**Processing gauge comments...**
Test accuracy is 1.0


**Processing recycling comments...**
Test accuracy is 0.9994580924855492


**Processing sink comments...**
Test accuracy is 0.9891618497109826


**Processing landscaping comments...**
Test accuracy is 0.9987355491329479


**Processing cistern comments...**
Test accuracy is 0.9998193641618497


**Proces

KeyboardInterrupt: 

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])
for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    #LogReg_pipeline.fit(x_train, train[category])
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing steel comments...**
Test accuracy is 0.9978323699421965


**Processing caulk comments...**
Test accuracy is 0.9985549132947977


**Processing pest-control comments...**
Test accuracy is 0.9981936416184971


**Processing winter comments...**
Test accuracy is 0.9996387283236994


**Processing gutters comments...**
Test accuracy is 0.9987355491329479


**Processing melamine comments...**


KeyboardInterrupt: 

In [2]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

NameError: name 'x_train' is not defined