In [1]:
import sklearn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from IPython.display import clear_output

In [2]:
newsgroups = fetch_20newsgroups(subset='all',remove = ('headers', 'footers', 'quotes'))
data = pd.DataFrame()
data['text'] = newsgroups.data
data['target'] = newsgroups.target
target_names = newsgroups.target_names

data

Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4
5,\n\nBack in high school I worked as a lab assi...,12
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10
8,"\n\n\nYeah, it's the second one. And I believ...",10
9,\nIf a Christian means someone who believes in...,19


# Intro to sklearn and classification

## Data Cleaning and Feature Creation


In [3]:
#Making all text lowercase
data['cleaned'] = data['text'].str.lower() 
data

Unnamed: 0,text,target,cleaned
0,\n\nI am sure some bashers of Pens fans are pr...,10,\n\ni am sure some bashers of pens fans are pr...
1,My brother is in the market for a high-perform...,3,my brother is in the market for a high-perform...
2,\n\n\n\n\tFinally you said what you dream abou...,17,\n\n\n\n\tfinally you said what you dream abou...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,\nthink!\n\nit's the scsi card doing the dma t...
4,1) I have an old Jasmine drive which I cann...,4,1) i have an old jasmine drive which i cann...
5,\n\nBack in high school I worked as a lab assi...,12,\n\nback in high school i worked as a lab assi...
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,\n\nae is in dallas...try 214/241-6060 or 214/...
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,"\n[stuff deleted]\n\nok, here's the solution t..."
8,"\n\n\nYeah, it's the second one. And I believ...",10,"\n\n\nyeah, it's the second one. and i believ..."
9,\nIf a Christian means someone who believes in...,19,\nif a christian means someone who believes in...


In [4]:
import re
#Removing special characters
data['cleaned'] = data['cleaned'].str.replace('\W+', ' ')
data['cleaned'] = data['cleaned'].str.replace('\d+', '')
data

Unnamed: 0,text,target,cleaned
0,\n\nI am sure some bashers of Pens fans are pr...,10,i am sure some bashers of pens fans are prett...
1,My brother is in the market for a high-perform...,3,my brother is in the market for a high perform...
2,\n\n\n\n\tFinally you said what you dream abou...,17,finally you said what you dream about mediter...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,think it s the scsi card doing the dma transf...
4,1) I have an old Jasmine drive which I cann...,4,i have an old jasmine drive which i cannot us...
5,\n\nBack in high school I worked as a lab assi...,12,back in high school i worked as a lab assista...
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,ae is in dallas try or tech support may...
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,stuff deleted ok here s the solution to your ...
8,"\n\n\nYeah, it's the second one. And I believ...",10,yeah it s the second one and i believe that p...
9,\nIf a Christian means someone who believes in...,19,if a christian means someone who believes in ...


In [5]:
# Removing Stop Words
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english')
stop = set(stop)
data['cleaned'].apply(lambda x: [item for item in x if item not in stop])
data

Unnamed: 0,text,target,cleaned
0,\n\nI am sure some bashers of Pens fans are pr...,10,i am sure some bashers of pens fans are prett...
1,My brother is in the market for a high-perform...,3,my brother is in the market for a high perform...
2,\n\n\n\n\tFinally you said what you dream abou...,17,finally you said what you dream about mediter...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,think it s the scsi card doing the dma transf...
4,1) I have an old Jasmine drive which I cann...,4,i have an old jasmine drive which i cannot us...
5,\n\nBack in high school I worked as a lab assi...,12,back in high school i worked as a lab assista...
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,ae is in dallas try or tech support may...
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,stuff deleted ok here s the solution to your ...
8,"\n\n\nYeah, it's the second one. And I believ...",10,yeah it s the second one and i believe that p...
9,\nIf a Christian means someone who believes in...,19,if a christian means someone who believes in ...


In [6]:
# from nltk.stem import PorterStemmer

# def stem_sentences(sentence):
#     porter_stemmer = PorterStemmer()
#     tokens = sentence.split()
#     stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
#     return ' '.join(stemmed_tokens)

In [7]:
# data['text'] = data['text'].apply(stem_sentences)
# data

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['cleaned'], data['target'], train_size=0.8,test_size=0.2)
x_train

5371      hams can legally run up to  watts it is very ...
17908     because i m a guy and most of my pillions are...
6820      didn t your operating system come with x sysv...
12806     oh my god my spelling and grammer suck i gues...
14382    l jb  ron what do you consider to be proper ch...
1842                       what s your favorite body part 
1482     does anyone on this group use this program it ...
14974     open up one of the airbag control boxes they ...
17931    i ve had similar problems downloading using wi...
5812     well i m amazed at how successful this exercis...
18490    i assume that can only be guessed at by the as...
3849      do you have a better e mail address mr lurie ...
614       both christians and non christians laugh at t...
10150       where did that idea come from it s news to me 
15077    the latest israeli proposal first proposed in ...
14205     the joystick reads in anolog values through a...
8159                     nanao call     ask for a catalo

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
x_train_counts.shape

(15076, 92329)

In [10]:
x_test_counts.shape

(3770, 92329)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=2000)
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
x_train_counts = x_train_counts.toarray()

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_test_tfidf = tfidf_transformer.transform(x_test_counts)

In [13]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=2000).fit(x_train_counts, y_train)
predicted = clf.predict(x_test_counts)
np.mean(predicted == y_test)



0.5363395225464191

In [14]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=2000).fit(x_train_tfidf, y_train)
predicted = clf.predict(x_test_tfidf)
np.mean(predicted == y_test)

0.6267904509283819

In [22]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted, target_names=target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.50      0.49      0.50       150
           comp.graphics       0.56      0.55      0.56       199
 comp.os.ms-windows.misc       0.66      0.60      0.63       220
comp.sys.ibm.pc.hardware       0.61      0.58      0.60       204
   comp.sys.mac.hardware       0.57      0.60      0.58       176
          comp.windows.x       0.70      0.71      0.70       187
            misc.forsale       0.77      0.73      0.75       199
               rec.autos       0.43      0.67      0.53       206
         rec.motorcycles       0.65      0.60      0.62       218
      rec.sport.baseball       0.67      0.66      0.66       201
        rec.sport.hockey       0.74      0.74      0.74       187
               sci.crypt       0.72      0.75      0.73       203
         sci.electronics       0.51      0.52      0.51       185
                 sci.med       0.68      0.69      0.69       203
         

In [16]:
tfidf_transformer = TfidfTransformer()
count_vect = CountVectorizer(max_features=2000)

x_counts = count_vect.fit_transform(data['cleaned'])
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [17]:
#initialize kmeans, choose random k points in the original array of of points
def init_kmeans(x,k):
    return [random.choice(x) for i in range(k)]

#this finds the minimum distance 
def find_min(z,x_cur):
    distances = []
    for z_i in range(len(z)):
        distances.append(np.linalg.norm(x_cur - z[z_i]))
    return distances.index(min(distances))
def plot_jclust(j_clust_a):
    for j_clust in j_clust_a:
        plt.plot(j_clust)
    plt.xlim(0, 25)
    plt.xlabel('Iteration')
    plt.ylabel('$J^{clust}$')
    plt.show

In [18]:
def kmeans(x,k):
    #Initializing K-means
    c = []
    j_clust = []  
    z = init_kmeans(x,k)
    
    while True:
        groups = [[] for new_list in range(k)]
        groups_i = [[] for new_list in range(k)]
        #Reassigning groups according to previously calculated representatives
        for x_i in range(len(x)):
            center = find_min(z,x[x_i])
            c.append(center)
            groups[center].append(x[x_i])
            groups_i[center].append(x_i)

            

        #Finding new representatives from groups
        for z_i in range(len(z)):
            z[z_i] = np.mean(groups[z_i], axis=0)

        #Calculating j_clust
        j = []
        for z_i in range(len(z)):
            for group_i, group_cur in enumerate(groups[z_i]):
                j.append(np.linalg.norm(group_cur - z[z_i])**2) 
        j_clust.append(np.sum(j)/len(x))

        #End conditions for K-means
        #Comparing j_clust to save computation time (with unchanging z_i, must have the same j_clust)
        if len(j_clust) == 0 or len(j_clust) == 1:
            continue
        #End Kmeans when j_clust is about equal
        if (round(j_clust[-1],3) == round(j_clust[-2],3)):
            break
    return z,j_clust,groups

In [19]:
z, j_clust, groups = kmeans(x_tfidf.toarray(), 10)

In [20]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1)
km.fit(x_tfidf.toarray())

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
np.find(km.labels_ == 10)

AttributeError: module 'numpy' has no attribute 'find'