In [6]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

category_map={'misc.forsale':'Sales','rec.motorcycles':'Motorcycles','rec.sport.baseball':'Baseball','sci.crypt':'Cryptography','sci.space':'Space'}

# Loading the training data based on the above categories...
training_data = fetch_20newsgroups(subset="train", categories=category_map.keys(), shuffle=True, random_state=7)
for i in range(6):
    print(training_data.data[i])

# Importing the feature extractor
cv = CountVectorizer()
# TC: term count
xtrain_tc = cv.fit_transform(training_data.data)

# Training the classifier using multinomial naive bayes classifier

# Defining random input sentences
input_data = ['The curveballs of right handed pitchers tend to curve motorcycle to the left', 
              'Caesar cipher is an ancient form of encryption']

# Define tfidf transformer object and train it..
tfidf_obj = TfidfTransformer()
xtrain_tfidf = tfidf_obj.fit_transform(xtrain_tc)

# Train the multi-nomial naive bayes classifier using above feature vectors
classifier = MultinomialNB().fit(xtrain_tfidf, training_data.target)

# Transform the input data using this word count
input_data_tc = cv.transform(input_data)
input_data_tfidf = tfidf_obj.transform(input_data_tc)

# Predict classifier
pc = classifier.predict(input_data_tfidf)
for sentence, category in zip(input_data, pc):
    print(sentence, "-->", training_data.target_names[category])


From: demers@cs.ucsd.edu (David DeMers)
Subject: Re: Montreal Question.......
Organization: CSE Dept., UC San Diego
Lines: 13
Nntp-Posting-Host: mbongo.ucsd.edu


In article <1993Apr19.015442.15723@oz.plymouth.edu>, k_mullin@oz.plymouth.edu (Mully) writes:
|> 
|>    What position does Mike Lansing play?  I cannot seem to find it 
|>  anywhere.  Thanks!!!!1

He's a shortstop by training, but he's been at second (mostly) and third
this year for the Expos.
-- 
Dave DeMers			 	        demers@cs.ucsd.edu
Computer Science & Engineering	0114		demers%cs@ucsd.bitnet
UC San Diego					...!ucsd!cs!demers
La Jolla, CA 92093-0114	(619) 534-0688, or -8187, FAX: (619) 534-7029

From: bclarke@galaxy.gov.bc.ca
Subject: Re: First Bike??
Organization: BC Systems Corporation
Lines: 8

In article <0forqFa00iUzMATnMz@andrew.cmu.edu>, James Leo Belliveau <jbc9+@andrew.cmu.edu> writes:
>     I am a serious motorcycle enthusiast without a motorcycle, and to
> put it bluntly, it sucks.  I really would like some 

In [5]:
import nltk
nltk.download('names')
import random
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

def gender_features(word,n=2):
    return ({'feature': word[-n:].lower()})

labels=[(name, 'male') for name in names.words('male.txt')]+[(name, 'female') for name in names.words('female.txt')]

print(labels[1:3])

data=['Leonardo', 'Amy', 'Sem', 'levely', 'king', 'mira', 'jay', 'ram', 'sita', 'priti', 'mahaveer']

feature_set=[(gender_features(n,3), gender) for (n,gender) in labels]

#Split
training, testing=feature_set[500:], feature_set[200:]

#model
classifier=NaiveBayesClassifier.train(training)

#EVALUATE
print(accuracy(classifier, testing)*100)

for name in data:
    print(name,'-->', classifier.classify(gender_features(name,5)))

[('Aaron', 'male'), ('Abbey', 'male')]
84.81404958677686
Leonardo --> female
Amy --> female
Sem --> female
levely --> female
king --> female
mira --> female
jay --> male
ram --> male
sita --> female
priti --> female
mahaveer --> female


[nltk_data] Downloading package names to
[nltk_data]     C:\Users\MCA2\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
