In [1]:
#################
### Naive Bayes Text Classification with Sklearn
#################
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, model_selection, naive_bayes, metrics, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer


### Input data: four of the 20-newsgroups categories
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Both splits are fetched with the same category filter and the same
# shuffle seed; only the `subset` argument differs.
fetch_options = dict(categories=categories, shuffle=True, random_state=42)

data_train = datasets.fetch_20newsgroups(subset='train', **fetch_options)
data_test = datasets.fetch_20newsgroups(subset='test', **fetch_options)

# Report the size of each split: training samples first, then test samples.
for split in (data_train, data_test):
    print(split.target.shape)

No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"


(2034,)
(1353,)


In [7]:
# Show the raw text of the first training post (headers included).
first_post = data_train.data[0]
print(first_post)

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



In [12]:
### Turn the raw posts into TF-IDF feature vectors
# TfidfVectorizer tokenizes each post and weights every term by TF-IDF.
# It is configured here with sublinear_tf=True, max_df=0.5 and the built-in
# English stop-word list.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)  # fit (learn the vocabulary) on the training posts, then transform them
X_test = vectorizer.transform(data_test.data)  # transform only: the test posts reuse the vocabulary fitted above

print(X_train.shape)  # (number of samples, number of features)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method on older
# installs. list(...) keeps `feature_names` a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

### Build the model
# alpha is the additive-smoothing pseudocount. Naive Bayes multiplies
# per-term probabilities together, so a single term with probability 0 would
# drive the whole product to 0. Adding a small pseudocount to every term
# count prevents that.
model = naive_bayes.MultinomialNB(alpha=0.01)

### Train the model on the training vectors and their labels
model.fit(X_train, data_train.target)

### Evaluate on the held-out test posts
pred = model.predict(X_test)

# Fraction of test posts whose predicted label matches the true label.
accuracy = metrics.accuracy_score(data_test.target, pred)

(2034, 33809)


In [11]:
# Peek at the first entry of the vocabulary list obtained from the vectorizer.
feature_names[0]

u'00'