### Clean Stack Exchange post text

In [2]:
# Import packages.
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import wordnet

In [None]:
#test = pd.read_csv("/Users/christinachang/Documents/STA141C/sta-141c-classify/data/biology.csv")

In [15]:
def tokenize_text(doc):
    """Combine the strings in the "response" column of dataframe df into one long string. Then, tokenize the
    string and make all words lowercase."""

    # Tokenize and make lowercase.
    words = nltk.word_tokenize(doc)
    words = [w.lower() for w in words]
    
    return words


def wordnet_pos(tag):
    """Map a Brown POS tag to a WordNet POS tag."""
    
    table = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    
    # Default to a noun.
    return table.get(tag[0], wordnet.NOUN)


def lemmatize_text(words):
    """Lemmatize words to get the base words. The input 'words' is a list of of words."""
    
    lemmatizer = nltk.WordNetLemmatizer()
    word_tags = nltk.pos_tag(words)
    words = [lemmatizer.lemmatize(w, wordnet_pos(t)) for (w, t) in word_tags]
    
    return words


def remove_stopwords(words):
    """Remove stopwords from a string."""
    
    stopwords = nltk.corpus.stopwords.words("english")
    words = [w for w in words if w not in stopwords]
    
    return words

def clean_text(doc):    
    """Tokenize, lemmatize, and remove stopwords for the text of all articles."""
    
    words = re.sub("< ?/?[a-z]+ ?>|\n", "", doc)
    words = tokenize_text(words)
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    doc = [w for w in words if w.isalnum()]
    doc = ' '.join(doc)
    
    return doc

def clean_df(df):
    """Combine the title and content of each post into one string and clean each string."""
    text = df['title'] + " " + df['content']
    df_clean = pd.DataFrame([clean_text(i) for i in text])
    df_clean.columns = ["text"]
    df_clean["tags"] = df["tags"]
    return df_clean

In [67]:
# Stack exchange topic names
names = ["biology","cooking","crypto","diy","robotics","travel"]

def get_paths(name):
    """Get path names for each file."""
    path = "data/"+name+".csv"
    return path

# Get path names
paths = [get_paths(i) for i in names]

# All data frames in a list.
dfs = [pd.read_csv(i) for i in paths]

# Get a list of the cleaned data frames.
clean_dfs = [clean_df(i) for i in dfs]

# Save cleaned dfs as csv
for i in range(len(names)):
    clean_dfs[i].to_pickle(names[i]+"_clean.csv")

In [57]:
# Test with subset of data.
subsets = [df[0:50] for df in dfs]
tmp1 = [clean_df(i) for i in subsets]

# See the file
pd.read_pickle("cleaned/biology_clean.dat")

In [5]:
data = pd.read_pickle("cleaned_trim/biology_trim_clean.dat")
data

Unnamed: 0,text,tags
0,rnase contamination rna base experiment preven...,rna biochemistry
1,lymphocyte size cluster two group tortora writ...,immunology cell-biology hematology
2,avoid digest dna interested sequence analyze b...,dna biochemistry molecular-biology
3,condition dendritic spine form look resource i...,neuroscience synapses
4,reason behind choose reporter gene experiment ...,molecular-genetics gene-expression experimenta...
5,many time endosymbiosis occur accord endosymbi...,evolution mitochondria chloroplasts
6,anyone try gibson assembly optimization anyone...,molecular-biology synthetic-biology
7,optimal frame size protein secondary structure...,bioinformatics homework
8,main mechanism interaction nervous immune syst...,neuroscience immunology
9,understand influenza strain designation strain...,microbiology virology influenza


In [28]:
split_list = [tags.split(" ") for tags in list(data['tags'])]

In [30]:
split_list
split = [item for sublist in split_list for item in sublist]

In [31]:
unique = list(set(split))

In [43]:
tags_df = pd.DataFrame(columns = unique)

joined = data.join(tags_df).fillna(0)

In [81]:
tags_only = joined.iloc[:, 2:np.shape(joined)[1]]

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [(10408, 657)] of <class 'tuple'>

In [82]:
split_list

for i in range(np.shape(tags_only)[0]):
    tags_only.iloc[i].loc[split_list[i]] = 1

In [79]:
tags_only.iloc[0].loc[split_list[0]]

rna             1
biochemistry    1
Name: 0, dtype: int64

In [80]:
split_list[0]



['rna', 'biochemistry']

In [86]:
data.iloc[:, 2:np.shape(joined)[1]] = tags_only

In [92]:
final = data.join(tags_only)
final

Unnamed: 0,text,tags,bioinorganic-chemistry,odour,chirality,dendrology,ultrasound,analgesia,aids,genomes,...,teeth,cell-biology,ncbi,peer-review-journal,physiology,lipids,elisa,medium,phylogenetics,surgery
0,rnase contamination rna base experiment preven...,rna biochemistry,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,lymphocyte size cluster two group tortora writ...,immunology cell-biology hematology,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,avoid digest dna interested sequence analyze b...,dna biochemistry molecular-biology,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,condition dendritic spine form look resource i...,neuroscience synapses,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,reason behind choose reporter gene experiment ...,molecular-genetics gene-expression experimenta...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,many time endosymbiosis occur accord endosymbi...,evolution mitochondria chloroplasts,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,anyone try gibson assembly optimization anyone...,molecular-biology synthetic-biology,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,optimal frame size protein secondary structure...,bioinformatics homework,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,main mechanism interaction nervous immune syst...,neuroscience immunology,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,understand influenza strain designation strain...,microbiology virology influenza,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(final, random_state=42, test_size=0.30, shuffle=True)



In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train['text'])
vectorizer.fit(test['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [95]:
x_train = vectorizer.transform(train['text'])
y_train = train.drop(labels = ['tags','text'], axis=1)

x_test = vectorizer.transform(test['text'])
y_test = test.drop(labels = ['tags','text'], axis=1)

In [97]:
categories = list(unique)

In [98]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing bioinorganic-chemistry comments...**
Test accuracy is 1.0


**Processing odour comments...**
Test accuracy is 0.9996797950688441


**Processing chirality comments...**
Test accuracy is 0.9993595901376882


**Processing dendrology comments...**
Test accuracy is 0.9990393852065321


**Processing ultrasound comments...**
Test accuracy is 0.9990393852065321


**Processing analgesia comments...**
Test accuracy is 0.9996797950688441


**Processing aids comments...**
Test accuracy is 0.9987191802753762


**Processing genomes comments...**
Test accuracy is 0.9967979506884406


**Processing waste-disposal comments...**
Test accuracy is 1.0


**Processing information comments...**
Test accuracy is 0.9996797950688441


**Processing dopamine comments...**
Test accuracy is 0.9996797950688441


**Processing cycle comments...**
Test accuracy is 1.0


**Processing cpg comments...**
Test accuracy is 0.9993595901376882


**Processing drosophila comments...**
Test accuracy is 0.9990393852065

Test accuracy is 0.9996797950688441


**Processing dreaming comments...**
Test accuracy is 0.9987191802753762


**Processing dehydration comments...**
Test accuracy is 0.9996797950688441


**Processing osmoregulation comments...**
Test accuracy is 0.9996797950688441


**Processing identification comments...**
Test accuracy is 0.9958373358949728


**Processing software comments...**
Test accuracy is 0.9971181556195965


**Processing eukaryotic-cells comments...**
Test accuracy is 0.9980787704130644


**Processing natural-selection comments...**
Test accuracy is 0.9865513928914506


**Processing inflammation comments...**
Test accuracy is 0.9987191802753762


**Processing blast comments...**
Test accuracy is 0.9961575408261287


**Processing trees comments...**
Test accuracy is 0.9967979506884406


**Processing amino-acids comments...**
Test accuracy is 0.9926352865834134


**Processing community-ecology comments...**
Test accuracy is 0.9977585654819084


**Processing organs comments...*

KeyboardInterrupt: 

In [101]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

MemoryError: 