In [1]:
import pandas as pd
import numpy as np 
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vectorizer', CountVectorizer(stop_words='english', max_df=0.8)),
('tfidf', TfidfTransformer()),
('classifier', LogisticRegression(solver='newton-cg', multi_class='multinomial'))])  # multiclass based

In [3]:
df = pd.read_csv('author_article_gender.csv').dropna()

In [4]:
df.size

15712

In [5]:
df.columns


Index(['Unnamed: 0', 'author', 'article', 'gender'], dtype='object')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,author,article,gender
0,0,bob tedeschi,Smilebox.com offers customers ability to build...,m
1,1,sharon waxman,Judith Regan says she is preparing to file law...,f
2,2,kathryn shattuck,11 A.M. (HGTV) ROSE PARADE 2007 -- HGTV presen...,f
3,3,nate chinen,Nate Chinen reviews performance by Gov't Mule ...,m
4,4,julie galambush,Julie Galambush reviews book The Misunderstood...,f


In [7]:
dataset_df = df[['article','gender']]

In [8]:
dataset_df

Unnamed: 0,article,gender
0,Smilebox.com offers customers ability to build...,m
1,Judith Regan says she is preparing to file law...,f
2,11 A.M. (HGTV) ROSE PARADE 2007 -- HGTV presen...,f
3,Nate Chinen reviews performance by Gov't Mule ...,m
4,Julie Galambush reviews book The Misunderstood...,f
5,Drilling Down column discusses Food Marketing ...,m
6,Sirius Satellite Radio and XM Satellite Radio ...,m
7,Anonymous computer programmer may have skewed ...,f
8,Wall Street Journal will introduce redesigned ...,f
9,Snow did not fall on New York City for entire ...,m


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_X,test_X,train_Y,test_Y = train_test_split(dataset_df['article'],dataset_df['gender'],test_size = 0.1)

In [11]:
train_X.size

3535

In [12]:
test_X.size

393

In [13]:
pipeline.fit(train_X,train_Y)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
  ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [14]:
predicted_Y = pipeline.predict(test_X)
predition_probabilities = pipeline.predict_proba(test_X ) #  Article : [M , 60% , f 40% ]
print(pipeline.classes_) # This will tell you about the classes 
predition_probabilities # No. 

# test_y : True Actual Gender
# predicted_y : predicted Gender
# probabilities : 

predicted_from_percentage = []
for _ in predition_probabilities:
    #print(np.max(_))
    print("The confidence based on the following probabilities : ", _)
    if np.max(_) > 0.63: # Threshold 
        #print(pipeline.classes_[np.argmax(_)])
        print("I am cofident for :",pipeline.classes_[np.argmax(_)]) # Printing the confidence for the specific class
        predicted_from_percentage.append(pipeline.classes_[np.argmax(_)])
    else:
        print("Null")
        predicted_from_percentage.append('NULL')

print("Total testing Size: ",len(test_X)) # total testing size
true_positive = 0
null_prediction = 0 
false_positive = 0 
for (predicted, actual) in zip(predicted_from_percentage,test_Y):
    if predicted == 'NULL':
        #print(predicted,actual)
        null_prediction += 1 
    elif predicted.strip() == actual.strip():
        true_positive += 1 # true positive rate 
    else:
        false_positive += 1# false positive rate
print("Total Predicted : ", (true_positive+ false_positive))
print("true prediction Count : ",true_positive)
print("False prediction count : ",false_positive)
print("Accuracy :", float(true_positive)/(float(true_positive)+float(false_positive)))
print("Null count: ",null_prediction)



# Dataset
# Dataset --> 2 (Train and test data) : 80 % Train , 20 % test
# Test ( Confusion Matrix and Accuracy score)
# comes form test 

['f' 'm']
The confidence based on the following probabilities :  [ 0.50038257  0.49961743]
Null
The confidence based on the following probabilities :  [ 0.5858052  0.4141948]
Null
The confidence based on the following probabilities :  [ 0.35357546  0.64642454]
I am cofident for : m
The confidence based on the following probabilities :  [ 0.57035386  0.42964614]
Null
The confidence based on the following probabilities :  [ 0.45819353  0.54180647]
Null
The confidence based on the following probabilities :  [ 0.41689953  0.58310047]
Null
The confidence based on the following probabilities :  [ 0.39098379  0.60901621]
Null
The confidence based on the following probabilities :  [ 0.28339204  0.71660796]
I am cofident for : m
The confidence based on the following probabilities :  [ 0.48038578  0.51961422]
Null
The confidence based on the following probabilities :  [ 0.45452627  0.54547373]
Null
The confidence based on the following probabilities :  [ 0.46055691  0.53944309]
Null
The confiden

In [67]:
new_df = pd.DataFrame()
new_df['article'] = test_X
new_df['actual_gender'] = test_Y
new_df['predicted_gender'] = predicted_Y

In [68]:
new_df['score'] = new_df['actual_gender'] == new_df['predicted_gender']

In [69]:
new_df

Unnamed: 0,article,actual_gender,predicted_gender,score
669,Introduced on Sunday: Jaguar C-XF design study...,m,m,True
195,Bob Herbert Op-Ed column lambastes Pres Bush f...,m,m,True
864,Charles Isherwood reviews Second Stage Theater...,f,m,False
1088,Living In article on Park Slope section of Bro...,m,f,False
3777,For the first time since the network newscasts...,m,m,True
958,Military operations in Somalia by US commandos...,m,m,True
3687,Stuart Goldman is charged in Camden NJ with ...,f,f,True
1173,China signs trade agreement on services with A...,m,m,True
780,Administration of New York Gov Eliot Spitzer m...,m,m,True
3781,BBC names longtime television executive Garth ...,m,m,True


### Support Vector Classification for Article on the same dataset 

In [70]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vectorizer', CountVectorizer(stop_words='english', max_df=0.8)),
('tfidf', TfidfTransformer()),
('classifier', LinearSVC())])  # multiclass based

In [71]:
pipeline.fit(train_X,train_Y)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [72]:
predicted_y = pipeline.predict(test_X)
predition_probabilities = pipeline.predict_proba(test_X ) #  Article : [M , 60% , f 40% ]
# Max (m ya female : ---> 75% predict other wise leave that thing )
# ARTICLE     -------- > [m : 60%,  f : 40% ] ---> m : 60% chance (threshold : 75% predict )
# Error rate : 10 atleast [m : 51, f : 49] --> m that is the reason that I would prefer you should use this function in your
# algorithm so that you can improve you accuracy a little high

# P: 1000 FIles :
# 85% percent 
# Predict : 300 Files  
# Null : 700 Files (later used )
new_df = pd.DataFrame()
new_df['article'] = test_X
new_df['actual_gender'] = test_Y
new_df['predicted_gender'] = predicted_Y
new_df['score'] = new_df['actual_gender'] == new_df['predicted_gender']
actual_females,actual_males,predicted_males,predicted_females = 0,0 , 0 , 0
for _ in new_df['actual_gender']:
    if str(_).strip() == 'f':
        actual_females += 1
    else:
        actual_males += 1 
for _ in new_df['predicted_gender']:
    if str(_).strip() == 'f':
        predicted_females += 1
    else:
        predicted_males += 1 
        
print("Actual Male - Female ",actual_males," \t ", actual_females)
print("predicted Male - Female ",predicted_males, '\t', predicted_females)


Actual Male - Female  237  	  156
predicted Male - Female  293 	 100


In [21]:
new_df

Unnamed: 0,article,actual_gender,predicted_gender,score
825,Damon Darlin Reporters' Notebook column on gli...,m,m,True
809,Pres Bush casts aside his brief diplomatic app...,f,m,False
3877,China has gone from having virtually no indepe...,m,m,True
2580,Jean Kane is shot to death in her car in State...,f,f,True
1198,Teresa Flores and Rufino Bedoya-Flores are cha...,f,f,True
1105,American Association of Orthodontists survey s...,f,m,False
2934,Oil services company Schlumberger Ltd announce...,m,m,True
2006,Jennifer Dunning reviews performance by Aspen ...,f,f,True
3737,New York Giants cut Luke Petitgout LaVar Arri...,f,m,False
353,Article on Friday Night Face-Off weekly impro...,f,m,False


In [22]:
from sklearn.metrics import accuracy_score
linear_score = accuracy_score(test_Y,predicted_y)

In [23]:
linear_score

0.7048346055979644

In [50]:
pipeline = Pipeline([('vectorizer', CountVectorizer(stop_words='english', min_df=10,max_df=0.7)),
('tfidf', TfidfTransformer()),
('classifier', LinearSVC())])  # multiclass based

In [51]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(pipeline,dataset_df['article'],dataset_df['gender'],cv = 10)
print("min score : ",np.min(cv_score))
print("mean score : ",np.mean(cv_score))
print("max score : ",np.max(cv_score))

min score :  0.625954198473
mean score :  0.67438723581
max score :  0.725190839695
